blob: 101bfbc85a087b4a69bbed2a0158be029907d81a [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

    Copyright (c) 1999 by Secret Labs AB
    Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff
70
/* Type check used by the assertion-carrying macros below.  In debug builds
   it runs the structural consistency check (without scanning the content);
   otherwise it is a plain type check. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
/* Field accessors for the string object layouts.

   _PyUnicode_UTF8 / _PyUnicode_UTF8_LENGTH read the fields of the
   PyCompactUnicodeObject header directly.  The PyUnicode_UTF8 /
   PyUnicode_UTF8_LENGTH variants assert that the object is a ready string
   and, for compact ASCII strings, use the character data stored right
   after the PyASCIIObject header (and its length) instead. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked accessors: usable as lvalues, no assertions. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Checked accessors: assert the object type before reading the field. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* Data pointer of a non-compact (PyUnicodeObject) string; unchecked. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
/* Local override of PyUnicode_READY: asserts the object type, evaluates to 0
   when the string is already ready, and otherwise to the result of
   _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the utf8 pointer of a (non compact-ASCII) string is the very same
   pointer as its character data, i.e. no separate UTF-8 buffer exists. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the string is the very same pointer as its
   character data.  The macro uses only its own parameter `op`; it must not
   depend on a variable named `unicode` existing at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
133
/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each of begin, end and to is evaluated exactly once.  The source is only
   read, so it is accessed through const-qualified pointers.  The main loop
   copies four elements per iteration; the trailing loop handles the
   remaining 0-3 elements. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
170 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179 do { \
180 if (unicode_empty != NULL) \
181 Py_INCREF(unicode_empty); \
182 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200183 unicode_empty = PyUnicode_New(0, 0); \
184 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200185 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
187 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200189 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
/* Return a new reference to the shared empty string from the enclosing
   function.  Returns whatever unicode_empty holds after the INCREF attempt,
   i.e. NULL if the singleton could not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0..127; an entry is 1 for whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Same for linebreaks.
   Indexed by code point 0..127; an entry is 1 for a line break character,
   0 otherwise.  The table is only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
292
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300293/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
294 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000296PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000298#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 /* This is actually an illegal character, so it should
302 not be passed to unichr. */
303 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000304#endif
305}
306
#ifdef Py_DEBUG
/* Debug-only sanity check of a string object's internal invariants.

   op:            object to check; must be a Unicode object.
   check_content: when non-zero (and the string is not in the legacy wchar
                  representation) also scan the characters to verify that
                  the narrowest possible kind is used and that the data is
                  null-terminated.

   Every violated invariant trips an assert(); the function itself always
   returns 1 so that it can be used inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header exists */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* legacy string that is not ready yet: only wstr is set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* legacy string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the data buffer exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the data must be terminated by a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
422
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100423static PyObject*
424unicode_result_wchar(PyObject *unicode)
425{
426#ifndef Py_DEBUG
427 Py_ssize_t len;
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 len = _PyUnicode_WSTR_LENGTH(unicode);
430 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200432 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 }
434
435 if (len == 1) {
436 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100437 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
439 Py_DECREF(unicode);
440 return latin1_char;
441 }
442 }
443
444 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200445 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 return NULL;
447 }
448#else
Victor Stinneraa771272012-10-04 02:32:58 +0200449 assert(Py_REFCNT(unicode) == 1);
450
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100451 /* don't make the result ready in debug mode to ensure that the caller
452 makes the string ready before using it */
453 assert(_PyUnicode_CheckConsistency(unicode, 1));
454#endif
455 return unicode;
456}
457
458static PyObject*
459unicode_result_ready(PyObject *unicode)
460{
461 Py_ssize_t length;
462
463 length = PyUnicode_GET_LENGTH(unicode);
464 if (length == 0) {
465 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200467 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 }
469 return unicode_empty;
470 }
471
472 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200473 void *data = PyUnicode_DATA(unicode);
474 int kind = PyUnicode_KIND(unicode);
475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 if (ch < 256) {
477 PyObject *latin1_char = unicode_latin1[ch];
478 if (latin1_char != NULL) {
479 if (unicode != latin1_char) {
480 Py_INCREF(latin1_char);
481 Py_DECREF(unicode);
482 }
483 return latin1_char;
484 }
485 else {
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 Py_INCREF(unicode);
488 unicode_latin1[ch] = unicode;
489 return unicode;
490 }
491 }
492 }
493
494 assert(_PyUnicode_CheckConsistency(unicode, 1));
495 return unicode;
496}
497
498static PyObject*
499unicode_result(PyObject *unicode)
500{
501 assert(_PyUnicode_CHECK(unicode));
502 if (PyUnicode_IS_READY(unicode))
503 return unicode_result_ready(unicode);
504 else
505 return unicode_result_wchar(unicode);
506}
507
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508static PyObject*
509unicode_result_unchanged(PyObject *unicode)
510{
511 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500512 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100513 return NULL;
514 Py_INCREF(unicode);
515 return unicode;
516 }
517 else
518 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100519 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100520}
521
#ifdef HAVE_MBCS
static OSVERSIONINFOEX winver;
#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526/* --- Bloom Filters ----------------------------------------------------- */
527
528/* stuff to implement simple "bloom filters" for Unicode characters.
529 to keep things simple, we use a single bitmask, using the least 5
530 bits from each unicode characters as the bit index. */
531
532/* the linebreak mask is set up by Unicode_Init below */
533
Antoine Pitrouf068f942010-01-13 14:19:12 +0000534#if LONG_BIT >= 128
535#define BLOOM_WIDTH 128
536#elif LONG_BIT >= 64
537#define BLOOM_WIDTH 64
538#elif LONG_BIT >= 32
539#define BLOOM_WIDTH 32
540#else
541#error "LONG_BIT is smaller than 32"
542#endif
543
/* Integer type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask for line break characters; starts with all bits set. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of ch is set in mask.
   Both arguments are parenthesised so that any expression may be passed
   (e.g. BLOOM(a | b, ch)). */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* True if ch is a line break: table lookup below 128, otherwise the bloom
   mask is consulted first and Py_UNICODE_ISLINEBREAK() only on a hit. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
Victor Stinnera85af502013-04-09 21:53:54 +0200557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
558 do { \
559 TYPE *data = (TYPE *)PTR; \
560 TYPE *end = data + LEN; \
561 Py_UCS4 ch; \
562 for (; data != end; data++) { \
563 ch = *data; \
564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565 } \
566 break; \
567 } while (0)
568
Thomas Wouters477c8d52006-05-27 19:21:47 +0000569 /* calculate simple bloom-style bitmask for a given unicode string */
570
Antoine Pitrouf068f942010-01-13 14:19:12 +0000571 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
573 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200574 switch (kind) {
575 case PyUnicode_1BYTE_KIND:
576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577 break;
578 case PyUnicode_2BYTE_KIND:
579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580 break;
581 case PyUnicode_4BYTE_KIND:
582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583 break;
584 default:
585 assert(0);
586 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000587 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200588
589#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000590}
591
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200592/* Compilation of templated routines */
593
594#include "stringlib/asciilib.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/partition.h"
597#include "stringlib/split.h"
598#include "stringlib/count.h"
599#include "stringlib/find.h"
600#include "stringlib/find_max_char.h"
601#include "stringlib/localeutil.h"
602#include "stringlib/undef.h"
603
604#include "stringlib/ucs1lib.h"
605#include "stringlib/fastsearch.h"
606#include "stringlib/partition.h"
607#include "stringlib/split.h"
608#include "stringlib/count.h"
609#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300610#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
615#include "stringlib/ucs2lib.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/partition.h"
618#include "stringlib/split.h"
619#include "stringlib/count.h"
620#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300621#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300632#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200633#include "stringlib/find_max_char.h"
634#include "stringlib/localeutil.h"
635#include "stringlib/undef.h"
636
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637#include "stringlib/unicodedefs.h"
638#include "stringlib/fastsearch.h"
639#include "stringlib/count.h"
640#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100641#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643/* --- Unicode Object ----------------------------------------------------- */
644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
649 Py_ssize_t size, Py_UCS4 ch,
650 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200652 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
653
654 switch (kind) {
655 case PyUnicode_1BYTE_KIND:
656 {
657 Py_UCS1 ch1 = (Py_UCS1) ch;
658 if (ch1 == ch)
659 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
660 else
661 return -1;
662 }
663 case PyUnicode_2BYTE_KIND:
664 {
665 Py_UCS2 ch2 = (Py_UCS2) ch;
666 if (ch2 == ch)
667 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
668 else
669 return -1;
670 }
671 case PyUnicode_4BYTE_KIND:
672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
673 default:
674 assert(0);
675 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677}
678
#ifdef Py_DEBUG
/* Debug-build helper: overwrite the characters in the range
   [old_length, current length) of `unicode` with 0xFF bytes so that code
   reading them before they are initialized is noticed sooner.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Nothing is written when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* The kind value is used directly as the number of bytes per character. */
    int char_width = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (old_length < new_length) {
        memset(base + old_length * char_width, 0xff,
               (new_length - old_length) * char_width);
    }
}
#endif
697
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698static PyObject*
699resize_compact(PyObject *unicode, Py_ssize_t length)
700{
701 Py_ssize_t char_size;
702 Py_ssize_t struct_size;
703 Py_ssize_t new_size;
704 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100705 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200706#ifdef Py_DEBUG
707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708#endif
709
Victor Stinner79891572012-05-03 13:43:07 +0200710 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100712 assert(PyUnicode_IS_COMPACT(unicode));
713
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200714 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100715 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 struct_size = sizeof(PyASCIIObject);
717 else
718 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200719 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
722 PyErr_NoMemory();
723 return NULL;
724 }
725 new_size = (struct_size + (length + 1) * char_size);
726
Victor Stinner84def372011-12-11 20:04:56 +0100727 _Py_DEC_REFTOTAL;
728 _Py_ForgetReference(unicode);
729
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300730 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100731 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100732 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200733 PyErr_NoMemory();
734 return NULL;
735 }
Victor Stinner84def372011-12-11 20:04:56 +0100736 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100738
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200740 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100742 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 _PyUnicode_WSTR_LENGTH(unicode) = length;
744 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746 PyObject_DEL(_PyUnicode_WSTR(unicode));
747 _PyUnicode_WSTR(unicode) = NULL;
748 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200749#ifdef Py_DEBUG
750 unicode_fill_invalid(unicode, old_length);
751#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
753 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200754 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200755 return unicode;
756}
757
Alexander Belopolsky40018472011-02-26 01:02:56 +0000758static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200759resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760{
Victor Stinner95663112011-10-04 01:03:50 +0200761 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100762 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000765
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 if (PyUnicode_IS_READY(unicode)) {
767 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200768 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200770#ifdef Py_DEBUG
771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
772#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200775 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200776 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
780 PyErr_NoMemory();
781 return -1;
782 }
783 new_size = (length + 1) * char_size;
784
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
786 {
787 PyObject_DEL(_PyUnicode_UTF8(unicode));
788 _PyUnicode_UTF8(unicode) = NULL;
789 _PyUnicode_UTF8_LENGTH(unicode) = 0;
790 }
791
Victor Stinnerfe226c02011-10-03 03:52:20 +0200792 data = (PyObject *)PyObject_REALLOC(data, new_size);
793 if (data == NULL) {
794 PyErr_NoMemory();
795 return -1;
796 }
797 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200798 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200799 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200800 _PyUnicode_WSTR_LENGTH(unicode) = length;
801 }
802 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200803 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200804 _PyUnicode_UTF8_LENGTH(unicode) = length;
805 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806 _PyUnicode_LENGTH(unicode) = length;
807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200808#ifdef Py_DEBUG
809 unicode_fill_invalid(unicode, old_length);
810#endif
Victor Stinner95663112011-10-04 01:03:50 +0200811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200812 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200814 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200815 }
Victor Stinner95663112011-10-04 01:03:50 +0200816 assert(_PyUnicode_WSTR(unicode) != NULL);
817
818 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700819 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +0200820 PyErr_NoMemory();
821 return -1;
822 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100823 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200824 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100825 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200826 if (!wstr) {
827 PyErr_NoMemory();
828 return -1;
829 }
830 _PyUnicode_WSTR(unicode) = wstr;
831 _PyUnicode_WSTR(unicode)[length] = 0;
832 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200833 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 return 0;
835}
836
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837static PyObject*
838resize_copy(PyObject *unicode, Py_ssize_t length)
839{
840 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100841 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843
Benjamin Petersonbac79492012-01-14 13:34:47 -0500844 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100845 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200846
847 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
848 if (copy == NULL)
849 return NULL;
850
851 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200852 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200853 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200854 }
855 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200856 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100857
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200858 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200859 if (w == NULL)
860 return NULL;
861 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
862 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
864 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200865 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200866 }
867}
868
Guido van Rossumd57fd912000-03-10 22:53:23 +0000869/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000870 Ux0000 terminated; some code (e.g. new_identifier)
871 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000872
873 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000874 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875
876*/
877
Alexander Belopolsky40018472011-02-26 01:02:56 +0000878static PyUnicodeObject *
879_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200881 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885 if (length == 0 && unicode_empty != NULL) {
886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200887 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888 }
889
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000890 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700891 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000892 return (PyUnicodeObject *)PyErr_NoMemory();
893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 if (length < 0) {
895 PyErr_SetString(PyExc_SystemError,
896 "Negative size passed to _PyUnicode_New");
897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 }
899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
901 if (unicode == NULL)
902 return NULL;
903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100904
905 _PyUnicode_WSTR_LENGTH(unicode) = length;
906 _PyUnicode_HASH(unicode) = -1;
907 _PyUnicode_STATE(unicode).interned = 0;
908 _PyUnicode_STATE(unicode).kind = 0;
909 _PyUnicode_STATE(unicode).compact = 0;
910 _PyUnicode_STATE(unicode).ready = 0;
911 _PyUnicode_STATE(unicode).ascii = 0;
912 _PyUnicode_DATA_ANY(unicode) = NULL;
913 _PyUnicode_LENGTH(unicode) = 0;
914 _PyUnicode_UTF8(unicode) = NULL;
915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100921 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923
Jeremy Hyltond8082792003-09-16 19:41:39 +0000924 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000925 * the caller fails before initializing str -- unicode_resize()
926 * reads str[0], and the Keep-Alive optimization can keep memory
927 * allocated for str alive across a call to unicode_dealloc(unicode).
928 * We don't want unicode_resize to read uninitialized memory in
929 * that case.
930 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200931 _PyUnicode_WSTR(unicode)[0] = 0;
932 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100933
Victor Stinner7931d9a2011-11-04 00:22:48 +0100934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935 return unicode;
936}
937
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938static const char*
939unicode_kind_name(PyObject *unicode)
940{
Victor Stinner42dfd712011-10-03 14:41:45 +0200941 /* don't check consistency: unicode_kind_name() is called from
942 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 if (!PyUnicode_IS_COMPACT(unicode))
944 {
945 if (!PyUnicode_IS_READY(unicode))
946 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600947 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 {
949 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200951 return "legacy ascii";
952 else
953 return "legacy latin1";
954 case PyUnicode_2BYTE_KIND:
955 return "legacy UCS2";
956 case PyUnicode_4BYTE_KIND:
957 return "legacy UCS4";
958 default:
959 return "<legacy invalid kind>";
960 }
961 }
962 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600963 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200965 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966 return "ascii";
967 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200972 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200973 default:
974 return "<invalid compact kind>";
975 }
976}
977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, so that it can be called
   from a debugger. */
char *_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
983
/* Function form of the _PyUnicode_COMPACT_DATA() macro, so that it can be
   called from a debugger. */
void *_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
987void *_PyUnicode_data(void *unicode){
988 printf("obj %p\n", unicode);
989 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
990 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
991 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
992 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
993 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
994 return PyUnicode_DATA(unicode);
995}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200996
997void
998_PyUnicode_Dump(PyObject *op)
999{
1000 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001001 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1002 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1003 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001004
Victor Stinnera849a4b2011-10-03 12:12:11 +02001005 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001006 {
1007 if (ascii->state.ascii)
1008 data = (ascii + 1);
1009 else
1010 data = (compact + 1);
1011 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001012 else
1013 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001014 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1015 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001016
Victor Stinnera849a4b2011-10-03 12:12:11 +02001017 if (ascii->wstr == data)
1018 printf("shared ");
1019 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001020
Victor Stinnera3b334d2011-10-03 13:53:37 +02001021 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001022 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001023 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1024 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001025 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1026 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001027 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001028 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001029}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030#endif
1031
1032PyObject *
1033PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1034{
1035 PyObject *obj;
1036 PyCompactUnicodeObject *unicode;
1037 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001038 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001039 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 Py_ssize_t char_size;
1041 Py_ssize_t struct_size;
1042
1043 /* Optimization for empty strings */
1044 if (size == 0 && unicode_empty != NULL) {
1045 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001046 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 }
1048
Victor Stinner9e9d6892011-10-04 01:02:02 +02001049 is_ascii = 0;
1050 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 struct_size = sizeof(PyCompactUnicodeObject);
1052 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001053 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 char_size = 1;
1055 is_ascii = 1;
1056 struct_size = sizeof(PyASCIIObject);
1057 }
1058 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001059 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 char_size = 1;
1061 }
1062 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001063 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 char_size = 2;
1065 if (sizeof(wchar_t) == 2)
1066 is_sharing = 1;
1067 }
1068 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001069 if (maxchar > MAX_UNICODE) {
1070 PyErr_SetString(PyExc_SystemError,
1071 "invalid maximum character passed to PyUnicode_New");
1072 return NULL;
1073 }
Victor Stinner8f825062012-04-27 13:55:39 +02001074 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075 char_size = 4;
1076 if (sizeof(wchar_t) == 4)
1077 is_sharing = 1;
1078 }
1079
1080 /* Ensure we won't overflow the size. */
1081 if (size < 0) {
1082 PyErr_SetString(PyExc_SystemError,
1083 "Negative size passed to PyUnicode_New");
1084 return NULL;
1085 }
1086 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1087 return PyErr_NoMemory();
1088
1089 /* Duplicated allocation code from _PyObject_New() instead of a call to
1090 * PyObject_New() so we are able to allocate space for the object and
1091 * it's data buffer.
1092 */
1093 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1094 if (obj == NULL)
1095 return PyErr_NoMemory();
1096 obj = PyObject_INIT(obj, &PyUnicode_Type);
1097 if (obj == NULL)
1098 return NULL;
1099
1100 unicode = (PyCompactUnicodeObject *)obj;
1101 if (is_ascii)
1102 data = ((PyASCIIObject*)obj) + 1;
1103 else
1104 data = unicode + 1;
1105 _PyUnicode_LENGTH(unicode) = size;
1106 _PyUnicode_HASH(unicode) = -1;
1107 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001108 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109 _PyUnicode_STATE(unicode).compact = 1;
1110 _PyUnicode_STATE(unicode).ready = 1;
1111 _PyUnicode_STATE(unicode).ascii = is_ascii;
1112 if (is_ascii) {
1113 ((char*)data)[size] = 0;
1114 _PyUnicode_WSTR(unicode) = NULL;
1115 }
Victor Stinner8f825062012-04-27 13:55:39 +02001116 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 ((char*)data)[size] = 0;
1118 _PyUnicode_WSTR(unicode) = NULL;
1119 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001121 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 else {
1124 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001125 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001126 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001128 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129 ((Py_UCS4*)data)[size] = 0;
1130 if (is_sharing) {
1131 _PyUnicode_WSTR_LENGTH(unicode) = size;
1132 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1133 }
1134 else {
1135 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1136 _PyUnicode_WSTR(unicode) = NULL;
1137 }
1138 }
Victor Stinner8f825062012-04-27 13:55:39 +02001139#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001140 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001141#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001142 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143 return obj;
1144}
1145
#if SIZEOF_WCHAR_T == 2
/* Helper for platforms with a 16-bit wchar_t: copy the units in
   [begin, end) into the 4-byte data of `unicode`, combining each
   high-surrogate/low-surrogate pair into a single code point.  A surrogate
   that is not part of a valid pair is copied through unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        /* Not a complete surrogate pair: copy one unit as is.  The second
           unit is only examined when it lies inside the input. */
        if (!Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            || (src + 1) >= end
            || !Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = *src;
            src++;
            continue;
        }

        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }

    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1185
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186static int
Victor Stinner488fa492011-12-12 00:01:39 +01001187unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001188{
Victor Stinner488fa492011-12-12 00:01:39 +01001189 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001190 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001191 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001192 return -1;
1193 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001194 return 0;
1195}
1196
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001197static int
1198_copy_characters(PyObject *to, Py_ssize_t to_start,
1199 PyObject *from, Py_ssize_t from_start,
1200 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001201{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001202 unsigned int from_kind, to_kind;
1203 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204
Victor Stinneree4544c2012-05-09 22:24:08 +02001205 assert(0 <= how_many);
1206 assert(0 <= from_start);
1207 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001208 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001209 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001210 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211
Victor Stinnerd3f08822012-05-29 12:57:52 +02001212 assert(PyUnicode_Check(to));
1213 assert(PyUnicode_IS_READY(to));
1214 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1215
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001216 if (how_many == 0)
1217 return 0;
1218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001222 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001223
Victor Stinnerf1852262012-06-16 16:38:26 +02001224#ifdef Py_DEBUG
1225 if (!check_maxchar
1226 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1227 {
1228 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1229 Py_UCS4 ch;
1230 Py_ssize_t i;
1231 for (i=0; i < how_many; i++) {
1232 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1233 assert(ch <= to_maxchar);
1234 }
1235 }
1236#endif
1237
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001238 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001239 if (check_maxchar
1240 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1241 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001242 /* Writing Latin-1 characters into an ASCII string requires to
1243 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001244 Py_UCS4 max_char;
1245 max_char = ucs1lib_find_max_char(from_data,
1246 (Py_UCS1*)from_data + how_many);
1247 if (max_char >= 128)
1248 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001249 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001250 Py_MEMCPY((char*)to_data + to_kind * to_start,
1251 (char*)from_data + from_kind * from_start,
1252 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001253 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001254 else if (from_kind == PyUnicode_1BYTE_KIND
1255 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001256 {
1257 _PyUnicode_CONVERT_BYTES(
1258 Py_UCS1, Py_UCS2,
1259 PyUnicode_1BYTE_DATA(from) + from_start,
1260 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1261 PyUnicode_2BYTE_DATA(to) + to_start
1262 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001264 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001265 && to_kind == PyUnicode_4BYTE_KIND)
1266 {
1267 _PyUnicode_CONVERT_BYTES(
1268 Py_UCS1, Py_UCS4,
1269 PyUnicode_1BYTE_DATA(from) + from_start,
1270 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1271 PyUnicode_4BYTE_DATA(to) + to_start
1272 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001273 }
1274 else if (from_kind == PyUnicode_2BYTE_KIND
1275 && to_kind == PyUnicode_4BYTE_KIND)
1276 {
1277 _PyUnicode_CONVERT_BYTES(
1278 Py_UCS2, Py_UCS4,
1279 PyUnicode_2BYTE_DATA(from) + from_start,
1280 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1281 PyUnicode_4BYTE_DATA(to) + to_start
1282 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001283 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001284 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1286
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001287 if (!check_maxchar) {
1288 if (from_kind == PyUnicode_2BYTE_KIND
1289 && to_kind == PyUnicode_1BYTE_KIND)
1290 {
1291 _PyUnicode_CONVERT_BYTES(
1292 Py_UCS2, Py_UCS1,
1293 PyUnicode_2BYTE_DATA(from) + from_start,
1294 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1295 PyUnicode_1BYTE_DATA(to) + to_start
1296 );
1297 }
1298 else if (from_kind == PyUnicode_4BYTE_KIND
1299 && to_kind == PyUnicode_1BYTE_KIND)
1300 {
1301 _PyUnicode_CONVERT_BYTES(
1302 Py_UCS4, Py_UCS1,
1303 PyUnicode_4BYTE_DATA(from) + from_start,
1304 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1305 PyUnicode_1BYTE_DATA(to) + to_start
1306 );
1307 }
1308 else if (from_kind == PyUnicode_4BYTE_KIND
1309 && to_kind == PyUnicode_2BYTE_KIND)
1310 {
1311 _PyUnicode_CONVERT_BYTES(
1312 Py_UCS4, Py_UCS2,
1313 PyUnicode_4BYTE_DATA(from) + from_start,
1314 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1315 PyUnicode_2BYTE_DATA(to) + to_start
1316 );
1317 }
1318 else {
1319 assert(0);
1320 return -1;
1321 }
1322 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001323 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001325 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 Py_ssize_t i;
1327
Victor Stinnera0702ab2011-09-29 14:14:38 +02001328 for (i=0; i < how_many; i++) {
1329 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001330 if (ch > to_maxchar)
1331 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001332 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1333 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001334 }
1335 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001336 return 0;
1337}
1338
Victor Stinnerd3f08822012-05-29 12:57:52 +02001339void
1340_PyUnicode_FastCopyCharacters(
1341 PyObject *to, Py_ssize_t to_start,
1342 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001343{
1344 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1345}
1346
1347Py_ssize_t
1348PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1349 PyObject *from, Py_ssize_t from_start,
1350 Py_ssize_t how_many)
1351{
1352 int err;
1353
1354 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1355 PyErr_BadInternalCall();
1356 return -1;
1357 }
1358
Benjamin Petersonbac79492012-01-14 13:34:47 -05001359 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001360 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001361 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001362 return -1;
1363
Victor Stinnerd3f08822012-05-29 12:57:52 +02001364 if (from_start < 0) {
1365 PyErr_SetString(PyExc_IndexError, "string index out of range");
1366 return -1;
1367 }
1368 if (to_start < 0) {
1369 PyErr_SetString(PyExc_IndexError, "string index out of range");
1370 return -1;
1371 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001372 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1373 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1374 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001375 "Cannot write %zi characters at %zi "
1376 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001377 how_many, to_start, PyUnicode_GET_LENGTH(to));
1378 return -1;
1379 }
1380
1381 if (how_many == 0)
1382 return 0;
1383
Victor Stinner488fa492011-12-12 00:01:39 +01001384 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001385 return -1;
1386
1387 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1388 if (err) {
1389 PyErr_Format(PyExc_SystemError,
1390 "Cannot copy %s characters "
1391 "into a string of %s characters",
1392 unicode_kind_name(from),
1393 unicode_kind_name(to));
1394 return -1;
1395 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001396 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397}
1398
Victor Stinner17222162011-09-28 22:15:37 +02001399/* Find the maximum code point and count the number of surrogate pairs so a
1400 correct string length can be computed before converting a string to UCS4.
1401 This function counts single surrogates as a character and not as a pair.
1402
1403 Return 0 on success, or -1 on error. */
1404static int
1405find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1406 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407{
1408 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001409 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410
Victor Stinnerc53be962011-10-02 21:33:54 +02001411 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412 *num_surrogates = 0;
1413 *maxchar = 0;
1414
1415 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001417 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1418 && (iter+1) < end
1419 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1420 {
1421 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1422 ++(*num_surrogates);
1423 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 }
1425 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001427 {
1428 ch = *iter;
1429 iter++;
1430 }
1431 if (ch > *maxchar) {
1432 *maxchar = ch;
1433 if (*maxchar > MAX_UNICODE) {
1434 PyErr_Format(PyExc_ValueError,
1435 "character U+%x is not in range [U+0000; U+10ffff]",
1436 ch);
1437 return -1;
1438 }
1439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 }
1441 return 0;
1442}
1443
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001444int
1445_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446{
1447 wchar_t *end;
1448 Py_UCS4 maxchar = 0;
1449 Py_ssize_t num_surrogates;
1450#if SIZEOF_WCHAR_T == 2
1451 Py_ssize_t length_wo_surrogates;
1452#endif
1453
Georg Brandl7597add2011-10-05 16:36:47 +02001454 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001455 strings were created using _PyObject_New() and where no canonical
1456 representation (the str field) has been set yet aka strings
1457 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001458 assert(_PyUnicode_CHECK(unicode));
1459 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001461 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001462 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001463 /* Actually, it should neither be interned nor be anything else: */
1464 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001467 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001468 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470
1471 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001472 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1473 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001474 PyErr_NoMemory();
1475 return -1;
1476 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001477 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 _PyUnicode_WSTR(unicode), end,
1479 PyUnicode_1BYTE_DATA(unicode));
1480 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1481 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1482 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1483 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001484 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001485 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001486 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 }
1488 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001489 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001490 _PyUnicode_UTF8(unicode) = NULL;
1491 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 }
1493 PyObject_FREE(_PyUnicode_WSTR(unicode));
1494 _PyUnicode_WSTR(unicode) = NULL;
1495 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1496 }
1497 /* In this case we might have to convert down from 4-byte native
1498 wchar_t to 2-byte unicode. */
1499 else if (maxchar < 65536) {
1500 assert(num_surrogates == 0 &&
1501 "FindMaxCharAndNumSurrogatePairs() messed up");
1502
Victor Stinner506f5922011-09-28 22:34:18 +02001503#if SIZEOF_WCHAR_T == 2
1504 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001505 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001506 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1507 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1508 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001509 _PyUnicode_UTF8(unicode) = NULL;
1510 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001511#else
1512 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001513 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001514 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001515 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001516 PyErr_NoMemory();
1517 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001518 }
Victor Stinner506f5922011-09-28 22:34:18 +02001519 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1520 _PyUnicode_WSTR(unicode), end,
1521 PyUnicode_2BYTE_DATA(unicode));
1522 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1523 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1524 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001525 _PyUnicode_UTF8(unicode) = NULL;
1526 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001527 PyObject_FREE(_PyUnicode_WSTR(unicode));
1528 _PyUnicode_WSTR(unicode) = NULL;
1529 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1530#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531 }
1532 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1533 else {
1534#if SIZEOF_WCHAR_T == 2
1535 /* in case the native representation is 2-bytes, we need to allocate a
1536 new normalized 4-byte version. */
1537 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001538 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1539 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 PyErr_NoMemory();
1541 return -1;
1542 }
1543 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1544 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001545 _PyUnicode_UTF8(unicode) = NULL;
1546 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001547 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1548 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001549 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550 PyObject_FREE(_PyUnicode_WSTR(unicode));
1551 _PyUnicode_WSTR(unicode) = NULL;
1552 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1553#else
1554 assert(num_surrogates == 0);
1555
Victor Stinnerc3c74152011-10-02 20:39:55 +02001556 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001557 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001558 _PyUnicode_UTF8(unicode) = NULL;
1559 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001560 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1561#endif
1562 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1563 }
1564 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001565 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001566 return 0;
1567}
1568
Alexander Belopolsky40018472011-02-26 01:02:56 +00001569static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001570unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571{
Walter Dörwald16807132007-05-25 13:52:07 +00001572 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001573 case SSTATE_NOT_INTERNED:
1574 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001575
Benjamin Peterson29060642009-01-31 22:14:21 +00001576 case SSTATE_INTERNED_MORTAL:
1577 /* revive dead object temporarily for DelItem */
1578 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001579 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001580 Py_FatalError(
1581 "deletion of interned string failed");
1582 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001583
Benjamin Peterson29060642009-01-31 22:14:21 +00001584 case SSTATE_INTERNED_IMMORTAL:
1585 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001586
Benjamin Peterson29060642009-01-31 22:14:21 +00001587 default:
1588 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001589 }
1590
Victor Stinner03490912011-10-03 23:45:12 +02001591 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001592 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001593 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001594 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001595 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1596 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001598 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599}
1600
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is the shared empty string or the
   cached one-character string stored in unicode_latin1[], otherwise 0. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready string of length 1 can be a cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1617
Alexander Belopolsky40018472011-02-26 01:02:56 +00001618static int
Victor Stinner488fa492011-12-12 00:01:39 +01001619unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001620{
Victor Stinner488fa492011-12-12 00:01:39 +01001621 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001622 if (Py_REFCNT(unicode) != 1)
1623 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001624 if (_PyUnicode_HASH(unicode) != -1)
1625 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001626 if (PyUnicode_CHECK_INTERNED(unicode))
1627 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001628 if (!PyUnicode_CheckExact(unicode))
1629 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001630#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001631 /* singleton refcount is greater than 1 */
1632 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001633#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001634 return 1;
1635}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001636
Victor Stinnerfe226c02011-10-03 03:52:20 +02001637static int
1638unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1639{
1640 PyObject *unicode;
1641 Py_ssize_t old_length;
1642
1643 assert(p_unicode != NULL);
1644 unicode = *p_unicode;
1645
1646 assert(unicode != NULL);
1647 assert(PyUnicode_Check(unicode));
1648 assert(0 <= length);
1649
Victor Stinner910337b2011-10-03 03:20:16 +02001650 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001651 old_length = PyUnicode_WSTR_LENGTH(unicode);
1652 else
1653 old_length = PyUnicode_GET_LENGTH(unicode);
1654 if (old_length == length)
1655 return 0;
1656
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001657 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001658 _Py_INCREF_UNICODE_EMPTY();
1659 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001660 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 Py_DECREF(*p_unicode);
1662 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001663 return 0;
1664 }
1665
Victor Stinner488fa492011-12-12 00:01:39 +01001666 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001667 PyObject *copy = resize_copy(unicode, length);
1668 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001669 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001670 Py_DECREF(*p_unicode);
1671 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001672 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001673 }
1674
Victor Stinnerfe226c02011-10-03 03:52:20 +02001675 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001676 PyObject *new_unicode = resize_compact(unicode, length);
1677 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001678 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001679 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001680 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001681 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001682 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001683}
1684
Alexander Belopolsky40018472011-02-26 01:02:56 +00001685int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001686PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001687{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001688 PyObject *unicode;
1689 if (p_unicode == NULL) {
1690 PyErr_BadInternalCall();
1691 return -1;
1692 }
1693 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001694 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001695 {
1696 PyErr_BadInternalCall();
1697 return -1;
1698 }
1699 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001700}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001701
Victor Stinnerc5166102012-02-22 13:55:02 +01001702/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001703
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001704 WARNING: The function doesn't copy the terminating null character and
1705 doesn't check the maximum character (may write a latin1 character in an
1706 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001707static void
1708unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1709 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001710{
1711 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1712 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001713 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001714
1715 switch (kind) {
1716 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001717 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001718#ifdef Py_DEBUG
1719 if (PyUnicode_IS_ASCII(unicode)) {
1720 Py_UCS4 maxchar = ucs1lib_find_max_char(
1721 (const Py_UCS1*)str,
1722 (const Py_UCS1*)str + len);
1723 assert(maxchar < 128);
1724 }
1725#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001726 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001727 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001728 }
1729 case PyUnicode_2BYTE_KIND: {
1730 Py_UCS2 *start = (Py_UCS2 *)data + index;
1731 Py_UCS2 *ucs2 = start;
1732 assert(index <= PyUnicode_GET_LENGTH(unicode));
1733
Victor Stinner184252a2012-06-16 02:57:41 +02001734 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001735 *ucs2 = (Py_UCS2)*str;
1736
1737 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001738 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001739 }
1740 default: {
1741 Py_UCS4 *start = (Py_UCS4 *)data + index;
1742 Py_UCS4 *ucs4 = start;
1743 assert(kind == PyUnicode_4BYTE_KIND);
1744 assert(index <= PyUnicode_GET_LENGTH(unicode));
1745
Victor Stinner184252a2012-06-16 02:57:41 +02001746 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001747 *ucs4 = (Py_UCS4)*str;
1748
1749 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001750 }
1751 }
1752}
1753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754static PyObject*
1755get_latin1_char(unsigned char ch)
1756{
Victor Stinnera464fc12011-10-02 20:39:30 +02001757 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001759 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 if (!unicode)
1761 return NULL;
1762 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001763 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 unicode_latin1[ch] = unicode;
1765 }
1766 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001767 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768}
1769
Victor Stinner985a82a2014-01-03 12:53:47 +01001770static PyObject*
1771unicode_char(Py_UCS4 ch)
1772{
1773 PyObject *unicode;
1774
1775 assert(ch <= MAX_UNICODE);
1776
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001777 if (ch < 256)
1778 return get_latin1_char(ch);
1779
Victor Stinner985a82a2014-01-03 12:53:47 +01001780 unicode = PyUnicode_New(1, ch);
1781 if (unicode == NULL)
1782 return NULL;
1783 switch (PyUnicode_KIND(unicode)) {
1784 case PyUnicode_1BYTE_KIND:
1785 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1786 break;
1787 case PyUnicode_2BYTE_KIND:
1788 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1789 break;
1790 default:
1791 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1792 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1793 }
1794 assert(_PyUnicode_CheckConsistency(unicode, 1));
1795 return unicode;
1796}
1797
Alexander Belopolsky40018472011-02-26 01:02:56 +00001798PyObject *
1799PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001801 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 Py_UCS4 maxchar = 0;
1803 Py_ssize_t num_surrogates;
1804
1805 if (u == NULL)
1806 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001808 /* If the Unicode data is known at construction time, we can apply
1809 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001812 if (size == 0)
1813 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 /* Single character Unicode objects in the Latin-1 range are
1816 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001817 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 return get_latin1_char((unsigned char)*u);
1819
1820 /* If not empty and not single character, copy the Unicode data
1821 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001822 if (find_maxchar_surrogates(u, u + size,
1823 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001824 return NULL;
1825
Victor Stinner8faf8212011-12-08 22:14:11 +01001826 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827 if (!unicode)
1828 return NULL;
1829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830 switch (PyUnicode_KIND(unicode)) {
1831 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001832 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1834 break;
1835 case PyUnicode_2BYTE_KIND:
1836#if Py_UNICODE_SIZE == 2
1837 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1838#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001839 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1841#endif
1842 break;
1843 case PyUnicode_4BYTE_KIND:
1844#if SIZEOF_WCHAR_T == 2
1845 /* This is the only case which has to process surrogates, thus
1846 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001847 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848#else
1849 assert(num_surrogates == 0);
1850 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1851#endif
1852 break;
1853 default:
1854 assert(0 && "Impossible state");
1855 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001857 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858}
1859
Alexander Belopolsky40018472011-02-26 01:02:56 +00001860PyObject *
1861PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001862{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001863 if (size < 0) {
1864 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001865 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001866 return NULL;
1867 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001868 if (u != NULL)
1869 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1870 else
1871 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001872}
1873
Alexander Belopolsky40018472011-02-26 01:02:56 +00001874PyObject *
1875PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001876{
1877 size_t size = strlen(u);
1878 if (size > PY_SSIZE_T_MAX) {
1879 PyErr_SetString(PyExc_OverflowError, "input too long");
1880 return NULL;
1881 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001882 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001883}
1884
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001885PyObject *
1886_PyUnicode_FromId(_Py_Identifier *id)
1887{
1888 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001889 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1890 strlen(id->string),
1891 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001892 if (!id->object)
1893 return NULL;
1894 PyUnicode_InternInPlace(&id->object);
1895 assert(!id->next);
1896 id->next = static_strings;
1897 static_strings = id;
1898 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001899 return id->object;
1900}
1901
1902void
1903_PyUnicode_ClearStaticStrings()
1904{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001905 _Py_Identifier *tmp, *s = static_strings;
1906 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001907 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001908 tmp = s->next;
1909 s->next = NULL;
1910 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001911 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001912 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001913}
1914
Benjamin Peterson0df54292012-03-26 14:50:32 -04001915/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001916
Victor Stinnerd3f08822012-05-29 12:57:52 +02001917PyObject*
1918_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001919{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001920 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001921 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001922 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001923#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001924 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001925#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001926 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001927 }
Victor Stinner785938e2011-12-11 20:09:03 +01001928 unicode = PyUnicode_New(size, 127);
1929 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001930 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001931 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1932 assert(_PyUnicode_CheckConsistency(unicode, 1));
1933 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001934}
1935
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001936static Py_UCS4
1937kind_maxchar_limit(unsigned int kind)
1938{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001939 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001940 case PyUnicode_1BYTE_KIND:
1941 return 0x80;
1942 case PyUnicode_2BYTE_KIND:
1943 return 0x100;
1944 case PyUnicode_4BYTE_KIND:
1945 return 0x10000;
1946 default:
1947 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001948 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001949 }
1950}
1951
Victor Stinnere6abb482012-05-02 01:15:40 +02001952Py_LOCAL_INLINE(Py_UCS4)
1953align_maxchar(Py_UCS4 maxchar)
1954{
1955 if (maxchar <= 127)
1956 return 127;
1957 else if (maxchar <= 255)
1958 return 255;
1959 else if (maxchar <= 65535)
1960 return 65535;
1961 else
1962 return MAX_UNICODE;
1963}
1964
Victor Stinner702c7342011-10-05 13:50:52 +02001965static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001966_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001969 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001970
Serhiy Storchaka678db842013-01-26 12:16:36 +02001971 if (size == 0)
1972 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001973 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001974 if (size == 1)
1975 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001976
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001977 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001978 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 if (!res)
1980 return NULL;
1981 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001982 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001983 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001984}
1985
Victor Stinnere57b1c02011-09-28 22:20:48 +02001986static PyObject*
1987_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988{
1989 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001990 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001991
Serhiy Storchaka678db842013-01-26 12:16:36 +02001992 if (size == 0)
1993 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001994 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01001995 if (size == 1)
1996 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001997
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001998 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001999 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 if (!res)
2001 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002002 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002004 else {
2005 _PyUnicode_CONVERT_BYTES(
2006 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2007 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002008 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 return res;
2010}
2011
Victor Stinnere57b1c02011-09-28 22:20:48 +02002012static PyObject*
2013_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014{
2015 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002016 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002017
Serhiy Storchaka678db842013-01-26 12:16:36 +02002018 if (size == 0)
2019 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002020 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002021 if (size == 1)
2022 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002023
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002024 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002025 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 if (!res)
2027 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002028 if (max_char < 256)
2029 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2030 PyUnicode_1BYTE_DATA(res));
2031 else if (max_char < 0x10000)
2032 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2033 PyUnicode_2BYTE_DATA(res));
2034 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002036 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 return res;
2038}
2039
2040PyObject*
2041PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2042{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002043 if (size < 0) {
2044 PyErr_SetString(PyExc_ValueError, "size must be positive");
2045 return NULL;
2046 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002047 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002049 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002051 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002053 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002054 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002055 PyErr_SetString(PyExc_SystemError, "invalid kind");
2056 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058}
2059
Victor Stinnerece58de2012-04-23 23:36:38 +02002060Py_UCS4
2061_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2062{
2063 enum PyUnicode_Kind kind;
2064 void *startptr, *endptr;
2065
2066 assert(PyUnicode_IS_READY(unicode));
2067 assert(0 <= start);
2068 assert(end <= PyUnicode_GET_LENGTH(unicode));
2069 assert(start <= end);
2070
2071 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2072 return PyUnicode_MAX_CHAR_VALUE(unicode);
2073
2074 if (start == end)
2075 return 127;
2076
Victor Stinner94d558b2012-04-27 22:26:58 +02002077 if (PyUnicode_IS_ASCII(unicode))
2078 return 127;
2079
Victor Stinnerece58de2012-04-23 23:36:38 +02002080 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002081 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002082 endptr = (char *)startptr + end * kind;
2083 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002084 switch(kind) {
2085 case PyUnicode_1BYTE_KIND:
2086 return ucs1lib_find_max_char(startptr, endptr);
2087 case PyUnicode_2BYTE_KIND:
2088 return ucs2lib_find_max_char(startptr, endptr);
2089 case PyUnicode_4BYTE_KIND:
2090 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002091 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002092 assert(0);
2093 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002094 }
2095}
2096
Victor Stinner25a4b292011-10-06 12:31:55 +02002097/* Ensure that a string uses the most efficient storage, if it is not the
2098 case: create a new string with of the right kind. Write NULL into *p_unicode
2099 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002100static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002101unicode_adjust_maxchar(PyObject **p_unicode)
2102{
2103 PyObject *unicode, *copy;
2104 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002105 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002106 unsigned int kind;
2107
2108 assert(p_unicode != NULL);
2109 unicode = *p_unicode;
2110 assert(PyUnicode_IS_READY(unicode));
2111 if (PyUnicode_IS_ASCII(unicode))
2112 return;
2113
2114 len = PyUnicode_GET_LENGTH(unicode);
2115 kind = PyUnicode_KIND(unicode);
2116 if (kind == PyUnicode_1BYTE_KIND) {
2117 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002118 max_char = ucs1lib_find_max_char(u, u + len);
2119 if (max_char >= 128)
2120 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002121 }
2122 else if (kind == PyUnicode_2BYTE_KIND) {
2123 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002124 max_char = ucs2lib_find_max_char(u, u + len);
2125 if (max_char >= 256)
2126 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002127 }
2128 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002129 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002130 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002131 max_char = ucs4lib_find_max_char(u, u + len);
2132 if (max_char >= 0x10000)
2133 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002134 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002135 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002136 if (copy != NULL)
2137 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002138 Py_DECREF(unicode);
2139 *p_unicode = copy;
2140}
2141
Victor Stinner034f6cf2011-09-30 02:26:44 +02002142PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002143_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002144{
Victor Stinner87af4f22011-11-21 23:03:47 +01002145 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002146 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002147
Victor Stinner034f6cf2011-09-30 02:26:44 +02002148 if (!PyUnicode_Check(unicode)) {
2149 PyErr_BadInternalCall();
2150 return NULL;
2151 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002152 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002153 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002154
Victor Stinner87af4f22011-11-21 23:03:47 +01002155 length = PyUnicode_GET_LENGTH(unicode);
2156 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002157 if (!copy)
2158 return NULL;
2159 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2160
Victor Stinner87af4f22011-11-21 23:03:47 +01002161 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2162 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002163 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002164 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002165}
2166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167
Victor Stinnerbc603d12011-10-02 01:00:40 +02002168/* Widen Unicode objects to larger buffers. Don't write terminating null
2169 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170
2171void*
2172_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2173{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002174 Py_ssize_t len;
2175 void *result;
2176 unsigned int skind;
2177
Benjamin Petersonbac79492012-01-14 13:34:47 -05002178 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002179 return NULL;
2180
2181 len = PyUnicode_GET_LENGTH(s);
2182 skind = PyUnicode_KIND(s);
2183 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002184 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 return NULL;
2186 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002187 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002188 case PyUnicode_2BYTE_KIND:
2189 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2190 if (!result)
2191 return PyErr_NoMemory();
2192 assert(skind == PyUnicode_1BYTE_KIND);
2193 _PyUnicode_CONVERT_BYTES(
2194 Py_UCS1, Py_UCS2,
2195 PyUnicode_1BYTE_DATA(s),
2196 PyUnicode_1BYTE_DATA(s) + len,
2197 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002199 case PyUnicode_4BYTE_KIND:
2200 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2201 if (!result)
2202 return PyErr_NoMemory();
2203 if (skind == PyUnicode_2BYTE_KIND) {
2204 _PyUnicode_CONVERT_BYTES(
2205 Py_UCS2, Py_UCS4,
2206 PyUnicode_2BYTE_DATA(s),
2207 PyUnicode_2BYTE_DATA(s) + len,
2208 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002210 else {
2211 assert(skind == PyUnicode_1BYTE_KIND);
2212 _PyUnicode_CONVERT_BYTES(
2213 Py_UCS1, Py_UCS4,
2214 PyUnicode_1BYTE_DATA(s),
2215 PyUnicode_1BYTE_DATA(s) + len,
2216 result);
2217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002219 default:
2220 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 }
Victor Stinner01698042011-10-04 00:04:26 +02002222 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 return NULL;
2224}
2225
2226static Py_UCS4*
2227as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2228 int copy_null)
2229{
2230 int kind;
2231 void *data;
2232 Py_ssize_t len, targetlen;
2233 if (PyUnicode_READY(string) == -1)
2234 return NULL;
2235 kind = PyUnicode_KIND(string);
2236 data = PyUnicode_DATA(string);
2237 len = PyUnicode_GET_LENGTH(string);
2238 targetlen = len;
2239 if (copy_null)
2240 targetlen++;
2241 if (!target) {
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07002242 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UCS4) < targetlen) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243 PyErr_NoMemory();
2244 return NULL;
2245 }
2246 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2247 if (!target) {
2248 PyErr_NoMemory();
2249 return NULL;
2250 }
2251 }
2252 else {
2253 if (targetsize < targetlen) {
2254 PyErr_Format(PyExc_SystemError,
2255 "string is longer than the buffer");
2256 if (copy_null && 0 < targetsize)
2257 target[0] = 0;
2258 return NULL;
2259 }
2260 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002261 if (kind == PyUnicode_1BYTE_KIND) {
2262 Py_UCS1 *start = (Py_UCS1 *) data;
2263 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002265 else if (kind == PyUnicode_2BYTE_KIND) {
2266 Py_UCS2 *start = (Py_UCS2 *) data;
2267 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2268 }
2269 else {
2270 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 if (copy_null)
2274 target[len] = 0;
2275 return target;
2276}
2277
2278Py_UCS4*
2279PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2280 int copy_null)
2281{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002282 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 PyErr_BadInternalCall();
2284 return NULL;
2285 }
2286 return as_ucs4(string, target, targetsize, copy_null);
2287}
2288
2289Py_UCS4*
2290PyUnicode_AsUCS4Copy(PyObject *string)
2291{
2292 return as_ucs4(string, NULL, 0, 1);
2293}
2294
2295#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002296
Alexander Belopolsky40018472011-02-26 01:02:56 +00002297PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002298PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002302 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002303 PyErr_BadInternalCall();
2304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 }
2306
Martin v. Löwis790465f2008-04-05 20:41:37 +00002307 if (size == -1) {
2308 size = wcslen(w);
2309 }
2310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002311 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312}
2313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002315
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2319#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002320
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002321static int
2322unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2323 Py_ssize_t width, Py_ssize_t precision)
2324{
2325 Py_ssize_t length, fill, arglen;
2326 Py_UCS4 maxchar;
2327
2328 if (PyUnicode_READY(str) == -1)
2329 return -1;
2330
2331 length = PyUnicode_GET_LENGTH(str);
2332 if ((precision == -1 || precision >= length)
2333 && width <= length)
2334 return _PyUnicodeWriter_WriteStr(writer, str);
2335
2336 if (precision != -1)
2337 length = Py_MIN(precision, length);
2338
2339 arglen = Py_MAX(length, width);
2340 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2341 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2342 else
2343 maxchar = writer->maxchar;
2344
2345 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2346 return -1;
2347
2348 if (width > length) {
2349 fill = width - length;
2350 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2351 return -1;
2352 writer->pos += fill;
2353 }
2354
2355 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2356 str, 0, length);
2357 writer->pos += length;
2358 return 0;
2359}
2360
2361static int
2362unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2363 Py_ssize_t width, Py_ssize_t precision)
2364{
2365 /* UTF-8 */
2366 Py_ssize_t length;
2367 PyObject *unicode;
2368 int res;
2369
2370 length = strlen(str);
2371 if (precision != -1)
2372 length = Py_MIN(length, precision);
2373 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2374 if (unicode == NULL)
2375 return -1;
2376
2377 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2378 Py_DECREF(unicode);
2379 return res;
2380}
2381
Victor Stinner96865452011-03-01 23:44:09 +00002382static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002383unicode_fromformat_arg(_PyUnicodeWriter *writer,
2384 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002385{
Victor Stinnere215d962012-10-06 23:03:36 +02002386 const char *p;
2387 Py_ssize_t len;
2388 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002389 Py_ssize_t width;
2390 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002391 int longflag;
2392 int longlongflag;
2393 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002394 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002395
2396 p = f;
2397 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002398 zeropad = 0;
2399 if (*f == '0') {
2400 zeropad = 1;
2401 f++;
2402 }
Victor Stinner96865452011-03-01 23:44:09 +00002403
2404 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002405 width = -1;
2406 if (Py_ISDIGIT((unsigned)*f)) {
2407 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002408 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002409 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002410 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002411 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002412 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002413 return NULL;
2414 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002415 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002416 f++;
2417 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002418 }
2419 precision = -1;
2420 if (*f == '.') {
2421 f++;
2422 if (Py_ISDIGIT((unsigned)*f)) {
2423 precision = (*f - '0');
2424 f++;
2425 while (Py_ISDIGIT((unsigned)*f)) {
2426 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2427 PyErr_SetString(PyExc_ValueError,
2428 "precision too big");
2429 return NULL;
2430 }
2431 precision = (precision * 10) + (*f - '0');
2432 f++;
2433 }
2434 }
Victor Stinner96865452011-03-01 23:44:09 +00002435 if (*f == '%') {
2436 /* "%.3%s" => f points to "3" */
2437 f--;
2438 }
2439 }
2440 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002441 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002442 f--;
2443 }
Victor Stinner96865452011-03-01 23:44:09 +00002444
2445 /* Handle %ld, %lu, %lld and %llu. */
2446 longflag = 0;
2447 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002448 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002449 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002450 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002451 longflag = 1;
2452 ++f;
2453 }
2454#ifdef HAVE_LONG_LONG
2455 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002456 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002457 longlongflag = 1;
2458 f += 2;
2459 }
2460#endif
2461 }
2462 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002463 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002464 size_tflag = 1;
2465 ++f;
2466 }
Victor Stinnere215d962012-10-06 23:03:36 +02002467
2468 if (f[1] == '\0')
2469 writer->overallocate = 0;
2470
2471 switch (*f) {
2472 case 'c':
2473 {
2474 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002475 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002476 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002477 "character argument not in range(0x110000)");
2478 return NULL;
2479 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002480 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002481 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002482 break;
2483 }
2484
2485 case 'i':
2486 case 'd':
2487 case 'u':
2488 case 'x':
2489 {
2490 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002491 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002492 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002493
2494 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002495 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002496 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002497 va_arg(*vargs, unsigned long));
2498#ifdef HAVE_LONG_LONG
2499 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002500 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002501 va_arg(*vargs, unsigned PY_LONG_LONG));
2502#endif
2503 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002504 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002505 va_arg(*vargs, size_t));
2506 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002507 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002508 va_arg(*vargs, unsigned int));
2509 }
2510 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002511 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002512 }
2513 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002514 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002515 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002516 va_arg(*vargs, long));
2517#ifdef HAVE_LONG_LONG
2518 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002519 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002520 va_arg(*vargs, PY_LONG_LONG));
2521#endif
2522 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002523 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002524 va_arg(*vargs, Py_ssize_t));
2525 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002526 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002527 va_arg(*vargs, int));
2528 }
2529 assert(len >= 0);
2530
Victor Stinnere215d962012-10-06 23:03:36 +02002531 if (precision < len)
2532 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002533
2534 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002535 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2536 return NULL;
2537
Victor Stinnere215d962012-10-06 23:03:36 +02002538 if (width > precision) {
2539 Py_UCS4 fillchar;
2540 fill = width - precision;
2541 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002542 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2543 return NULL;
2544 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002545 }
Victor Stinner15a11362012-10-06 23:48:20 +02002546 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002547 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002548 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2549 return NULL;
2550 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002551 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002552
Victor Stinner4a587072013-11-19 12:54:53 +01002553 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2554 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002555 break;
2556 }
2557
2558 case 'p':
2559 {
2560 char number[MAX_LONG_LONG_CHARS];
2561
2562 len = sprintf(number, "%p", va_arg(*vargs, void*));
2563 assert(len >= 0);
2564
2565 /* %p is ill-defined: ensure leading 0x. */
2566 if (number[1] == 'X')
2567 number[1] = 'x';
2568 else if (number[1] != 'x') {
2569 memmove(number + 2, number,
2570 strlen(number) + 1);
2571 number[0] = '0';
2572 number[1] = 'x';
2573 len += 2;
2574 }
2575
Victor Stinner4a587072013-11-19 12:54:53 +01002576 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002577 return NULL;
2578 break;
2579 }
2580
2581 case 's':
2582 {
2583 /* UTF-8 */
2584 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002585 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002586 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002587 break;
2588 }
2589
2590 case 'U':
2591 {
2592 PyObject *obj = va_arg(*vargs, PyObject *);
2593 assert(obj && _PyUnicode_CHECK(obj));
2594
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002596 return NULL;
2597 break;
2598 }
2599
2600 case 'V':
2601 {
2602 PyObject *obj = va_arg(*vargs, PyObject *);
2603 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002604 if (obj) {
2605 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002607 return NULL;
2608 }
2609 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002610 assert(str != NULL);
2611 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002612 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002613 }
2614 break;
2615 }
2616
2617 case 'S':
2618 {
2619 PyObject *obj = va_arg(*vargs, PyObject *);
2620 PyObject *str;
2621 assert(obj);
2622 str = PyObject_Str(obj);
2623 if (!str)
2624 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002625 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002626 Py_DECREF(str);
2627 return NULL;
2628 }
2629 Py_DECREF(str);
2630 break;
2631 }
2632
2633 case 'R':
2634 {
2635 PyObject *obj = va_arg(*vargs, PyObject *);
2636 PyObject *repr;
2637 assert(obj);
2638 repr = PyObject_Repr(obj);
2639 if (!repr)
2640 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002641 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002642 Py_DECREF(repr);
2643 return NULL;
2644 }
2645 Py_DECREF(repr);
2646 break;
2647 }
2648
2649 case 'A':
2650 {
2651 PyObject *obj = va_arg(*vargs, PyObject *);
2652 PyObject *ascii;
2653 assert(obj);
2654 ascii = PyObject_ASCII(obj);
2655 if (!ascii)
2656 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002657 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002658 Py_DECREF(ascii);
2659 return NULL;
2660 }
2661 Py_DECREF(ascii);
2662 break;
2663 }
2664
2665 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002666 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002667 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002668 break;
2669
2670 default:
2671 /* if we stumble upon an unknown formatting code, copy the rest
2672 of the format string to the output string. (we cannot just
2673 skip the code, since there's no way to know what's in the
2674 argument list) */
2675 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002676 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002677 return NULL;
2678 f = p+len;
2679 return f;
2680 }
2681
2682 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002683 return f;
2684}
2685
Walter Dörwaldd2034312007-05-18 16:29:38 +00002686PyObject *
2687PyUnicode_FromFormatV(const char *format, va_list vargs)
2688{
Victor Stinnere215d962012-10-06 23:03:36 +02002689 va_list vargs2;
2690 const char *f;
2691 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002692
Victor Stinner8f674cc2013-04-17 23:02:17 +02002693 _PyUnicodeWriter_Init(&writer);
2694 writer.min_length = strlen(format) + 100;
2695 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002696
2697 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2698 Copy it to be able to pass a reference to a subfunction. */
2699 Py_VA_COPY(vargs2, vargs);
2700
2701 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002703 f = unicode_fromformat_arg(&writer, f, &vargs2);
2704 if (f == NULL)
2705 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002708 const char *p;
2709 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002710
Victor Stinnere215d962012-10-06 23:03:36 +02002711 p = f;
2712 do
2713 {
2714 if ((unsigned char)*p > 127) {
2715 PyErr_Format(PyExc_ValueError,
2716 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2717 "string, got a non-ASCII byte: 0x%02x",
2718 (unsigned char)*p);
2719 return NULL;
2720 }
2721 p++;
2722 }
2723 while (*p != '\0' && *p != '%');
2724 len = p - f;
2725
2726 if (*p == '\0')
2727 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002728
2729 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002730 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002731
2732 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002733 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002734 }
Victor Stinnere215d962012-10-06 23:03:36 +02002735 return _PyUnicodeWriter_Finish(&writer);
2736
2737 fail:
2738 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002739 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002740}
2741
Walter Dörwaldd2034312007-05-18 16:29:38 +00002742PyObject *
2743PyUnicode_FromFormat(const char *format, ...)
2744{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002745 PyObject* ret;
2746 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002747
2748#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002749 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002750#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002751 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002752#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002753 ret = PyUnicode_FromFormatV(format, vargs);
2754 va_end(vargs);
2755 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002756}
2757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002758#ifdef HAVE_WCHAR_H
2759
Victor Stinner5593d8a2010-10-02 11:11:27 +00002760/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2761 convert a Unicode object to a wide character string.
2762
Victor Stinnerd88d9832011-09-06 02:00:05 +02002763 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002764 character) required to convert the unicode object. Ignore size argument.
2765
Victor Stinnerd88d9832011-09-06 02:00:05 +02002766 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002767 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002768 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002769static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002770unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002771 wchar_t *w,
2772 Py_ssize_t size)
2773{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002774 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 const wchar_t *wstr;
2776
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002777 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002778 if (wstr == NULL)
2779 return -1;
2780
Victor Stinner5593d8a2010-10-02 11:11:27 +00002781 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002782 if (size > res)
2783 size = res + 1;
2784 else
2785 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002786 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002787 return res;
2788 }
2789 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002790 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002791}
2792
2793Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002794PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002795 wchar_t *w,
2796 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797{
2798 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002799 PyErr_BadInternalCall();
2800 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002802 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803}
2804
Victor Stinner137c34c2010-09-29 10:25:54 +00002805wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002806PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002807 Py_ssize_t *size)
2808{
2809 wchar_t* buffer;
2810 Py_ssize_t buflen;
2811
2812 if (unicode == NULL) {
2813 PyErr_BadInternalCall();
2814 return NULL;
2815 }
2816
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002817 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002818 if (buflen == -1)
2819 return NULL;
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07002820 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002821 PyErr_NoMemory();
2822 return NULL;
2823 }
2824
Victor Stinner137c34c2010-09-29 10:25:54 +00002825 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2826 if (buffer == NULL) {
2827 PyErr_NoMemory();
2828 return NULL;
2829 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002830 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002831 if (buflen == -1) {
2832 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002833 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002834 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002835 if (size != NULL)
2836 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002837 return buffer;
2838}
2839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002840#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841
Alexander Belopolsky40018472011-02-26 01:02:56 +00002842PyObject *
2843PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002844{
Victor Stinner8faf8212011-12-08 22:14:11 +01002845 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002846 PyErr_SetString(PyExc_ValueError,
2847 "chr() arg not in range(0x110000)");
2848 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002849 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002850
Victor Stinner985a82a2014-01-03 12:53:47 +01002851 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002852}
2853
Alexander Belopolsky40018472011-02-26 01:02:56 +00002854PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002855PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002857 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002858 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002859 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002860 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002861 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002862 Py_INCREF(obj);
2863 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002864 }
2865 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002866 /* For a Unicode subtype that's not a Unicode object,
2867 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002868 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002869 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002870 PyErr_Format(PyExc_TypeError,
2871 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002872 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002873 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002874}
2875
Alexander Belopolsky40018472011-02-26 01:02:56 +00002876PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002877PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002878 const char *encoding,
2879 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002880{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002881 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002882 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002883
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 PyErr_BadInternalCall();
2886 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002888
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002889 /* Decoding bytes objects is the most common case and should be fast */
2890 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002891 if (PyBytes_GET_SIZE(obj) == 0)
2892 _Py_RETURN_UNICODE_EMPTY();
2893 v = PyUnicode_Decode(
2894 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2895 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002896 return v;
2897 }
2898
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002899 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002900 PyErr_SetString(PyExc_TypeError,
2901 "decoding str is not supported");
2902 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002903 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002904
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002905 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2906 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2907 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02002908 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002909 Py_TYPE(obj)->tp_name);
2910 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002911 }
Tim Petersced69f82003-09-16 20:30:58 +00002912
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002913 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002914 PyBuffer_Release(&buffer);
2915 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002916 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002917
Serhiy Storchaka05997252013-01-26 12:14:02 +02002918 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002919 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002920 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921}
2922
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  The normalized, null-terminated name is written to
   lower, a buffer of lower_len bytes.  A NULL encoding is normalized to
   "utf-8".

   Return 1 on success, 0 on error: the result (including its terminating
   null byte) does not fit in lower_len bytes.  This includes
   lower_len == 0, in which case lower is not touched. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* lower_len is unsigned: with an empty buffer, lower_len - 1 below would
       wrap around, the end-of-buffer test would never trigger and the loop
       would write outside of lower. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot, reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2962
Alexander Belopolsky40018472011-02-26 01:02:56 +00002963PyObject *
2964PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002965 Py_ssize_t size,
2966 const char *encoding,
2967 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002968{
2969 PyObject *buffer = NULL, *unicode;
2970 Py_buffer info;
2971 char lower[11]; /* Enough for any encoding shortcut */
2972
Fred Drakee4315f52000-05-09 19:53:39 +00002973 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002974 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002975 if ((strcmp(lower, "utf-8") == 0) ||
2976 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002977 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002978 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002979 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01002980 (strcmp(lower, "iso-8859-1") == 0) ||
2981 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002982 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002983#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002984 else if (strcmp(lower, "mbcs") == 0)
2985 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002986#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002987 else if (strcmp(lower, "ascii") == 0)
2988 return PyUnicode_DecodeASCII(s, size, errors);
2989 else if (strcmp(lower, "utf-16") == 0)
2990 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2991 else if (strcmp(lower, "utf-32") == 0)
2992 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994
2995 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002996 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002997 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002998 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002999 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000 if (buffer == NULL)
3001 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003002 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003 if (unicode == NULL)
3004 goto onError;
3005 if (!PyUnicode_Check(unicode)) {
3006 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003007 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3008 "use codecs.decode() to decode to arbitrary types",
3009 encoding,
3010 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 Py_DECREF(unicode);
3012 goto onError;
3013 }
3014 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003015 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003016
Benjamin Peterson29060642009-01-31 22:14:21 +00003017 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018 Py_XDECREF(buffer);
3019 return NULL;
3020}
3021
Alexander Belopolsky40018472011-02-26 01:02:56 +00003022PyObject *
3023PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003024 const char *encoding,
3025 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003026{
3027 PyObject *v;
3028
3029 if (!PyUnicode_Check(unicode)) {
3030 PyErr_BadArgument();
3031 goto onError;
3032 }
3033
3034 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003035 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003036
3037 /* Decode via the codec registry */
3038 v = PyCodec_Decode(unicode, encoding, errors);
3039 if (v == NULL)
3040 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003041 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003042
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003044 return NULL;
3045}
3046
Alexander Belopolsky40018472011-02-26 01:02:56 +00003047PyObject *
3048PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003049 const char *encoding,
3050 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003051{
3052 PyObject *v;
3053
3054 if (!PyUnicode_Check(unicode)) {
3055 PyErr_BadArgument();
3056 goto onError;
3057 }
3058
3059 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003060 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003061
3062 /* Decode via the codec registry */
3063 v = PyCodec_Decode(unicode, encoding, errors);
3064 if (v == NULL)
3065 goto onError;
3066 if (!PyUnicode_Check(v)) {
3067 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003068 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3069 "use codecs.decode() to decode to arbitrary types",
3070 encoding,
3071 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003072 Py_DECREF(v);
3073 goto onError;
3074 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003075 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003076
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003078 return NULL;
3079}
3080
Alexander Belopolsky40018472011-02-26 01:02:56 +00003081PyObject *
3082PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003083 Py_ssize_t size,
3084 const char *encoding,
3085 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086{
3087 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003088
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 unicode = PyUnicode_FromUnicode(s, size);
3090 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3093 Py_DECREF(unicode);
3094 return v;
3095}
3096
Alexander Belopolsky40018472011-02-26 01:02:56 +00003097PyObject *
3098PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003099 const char *encoding,
3100 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003101{
3102 PyObject *v;
3103
3104 if (!PyUnicode_Check(unicode)) {
3105 PyErr_BadArgument();
3106 goto onError;
3107 }
3108
3109 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003110 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003111
3112 /* Encode via the codec registry */
3113 v = PyCodec_Encode(unicode, encoding, errors);
3114 if (v == NULL)
3115 goto onError;
3116 return v;
3117
Benjamin Peterson29060642009-01-31 22:14:21 +00003118 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003119 return NULL;
3120}
3121
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   The string is converted one character at a time (one surrogate pair at a
   time when wchar_t is 16 bits wide) into a scratch buffer.  The return
   value is the index, in wchar_t units, of the first character for which
   wcstombs() reports (size_t)-1.  If every character converts, 0 is
   returned, so 0 is ambiguous between "first character" and "not found". */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};    /* surrogate pair + terminator */
#else
    wchar_t unit[2] = {0, 0};       /* one character + terminator */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    while (*cursor != L'\0') {
        /* Remember where this character starts: that is the error index. */
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3168
Victor Stinner1b579672011-12-17 05:47:23 +01003169static int
3170locale_error_handler(const char *errors, int *surrogateescape)
3171{
3172 if (errors == NULL) {
3173 *surrogateescape = 0;
3174 return 0;
3175 }
3176
3177 if (strcmp(errors, "strict") == 0) {
3178 *surrogateescape = 0;
3179 return 0;
3180 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003181 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003182 *surrogateescape = 1;
3183 return 0;
3184 }
3185 PyErr_Format(PyExc_ValueError,
3186 "only 'strict' and 'surrogateescape' error handlers "
3187 "are supported, not '%s'",
3188 errors);
3189 return -1;
3190}
3191
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003192PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003193PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003194{
3195 Py_ssize_t wlen, wlen2;
3196 wchar_t *wstr;
3197 PyObject *bytes = NULL;
3198 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003199 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003200 PyObject *exc;
3201 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003202 int surrogateescape;
3203
3204 if (locale_error_handler(errors, &surrogateescape) < 0)
3205 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003206
3207 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3208 if (wstr == NULL)
3209 return NULL;
3210
3211 wlen2 = wcslen(wstr);
3212 if (wlen2 != wlen) {
3213 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003214 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003215 return NULL;
3216 }
3217
3218 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003219 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003220 char *str;
3221
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003222 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003223 if (str == NULL) {
3224 if (error_pos == (size_t)-1) {
3225 PyErr_NoMemory();
3226 PyMem_Free(wstr);
3227 return NULL;
3228 }
3229 else {
3230 goto encode_error;
3231 }
3232 }
3233 PyMem_Free(wstr);
3234
3235 bytes = PyBytes_FromString(str);
3236 PyMem_Free(str);
3237 }
3238 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003239 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003240 size_t len, len2;
3241
3242 len = wcstombs(NULL, wstr, 0);
3243 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003244 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003245 goto encode_error;
3246 }
3247
3248 bytes = PyBytes_FromStringAndSize(NULL, len);
3249 if (bytes == NULL) {
3250 PyMem_Free(wstr);
3251 return NULL;
3252 }
3253
3254 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3255 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003256 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003257 goto encode_error;
3258 }
3259 PyMem_Free(wstr);
3260 }
3261 return bytes;
3262
3263encode_error:
3264 errmsg = strerror(errno);
3265 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003266
3267 if (error_pos == (size_t)-1)
3268 error_pos = wcstombs_errorpos(wstr);
3269
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003270 PyMem_Free(wstr);
3271 Py_XDECREF(bytes);
3272
Victor Stinner2f197072011-12-17 07:08:30 +01003273 if (errmsg != NULL) {
3274 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003275 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003276 if (wstr != NULL) {
3277 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003278 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003279 } else
3280 errmsg = NULL;
3281 }
3282 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003283 reason = PyUnicode_FromString(
3284 "wcstombs() encountered an unencodable "
3285 "wide character");
3286 if (reason == NULL)
3287 return NULL;
3288
3289 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3290 "locale", unicode,
3291 (Py_ssize_t)error_pos,
3292 (Py_ssize_t)(error_pos+1),
3293 reason);
3294 Py_DECREF(reason);
3295 if (exc != NULL) {
3296 PyCodec_StrictErrors(exc);
3297 Py_XDECREF(exc);
3298 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003299 return NULL;
3300}
3301
Victor Stinnerad158722010-10-27 00:25:46 +00003302PyObject *
3303PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003304{
Victor Stinner99b95382011-07-04 14:23:54 +02003305#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003306 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003307#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003308 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003309#else
Victor Stinner793b5312011-04-27 00:24:21 +02003310 PyInterpreterState *interp = PyThreadState_GET()->interp;
3311 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3312 cannot use it to encode and decode filenames before it is loaded. Load
3313 the Python codec requires to encode at least its own filename. Use the C
3314 version of the locale codec until the codec registry is initialized and
3315 the Python codec is loaded.
3316
3317 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3318 cannot only rely on it: check also interp->fscodec_initialized for
3319 subinterpreters. */
3320 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003321 return PyUnicode_AsEncodedString(unicode,
3322 Py_FileSystemDefaultEncoding,
3323 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003324 }
3325 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003326 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003327 }
Victor Stinnerad158722010-10-27 00:25:46 +00003328#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003329}
3330
Alexander Belopolsky40018472011-02-26 01:02:56 +00003331PyObject *
3332PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003333 const char *encoding,
3334 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335{
3336 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003337 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003338
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 if (!PyUnicode_Check(unicode)) {
3340 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003341 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342 }
Fred Drakee4315f52000-05-09 19:53:39 +00003343
Fred Drakee4315f52000-05-09 19:53:39 +00003344 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003345 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003346 if ((strcmp(lower, "utf-8") == 0) ||
3347 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003348 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003349 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003350 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003351 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003352 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003353 }
Victor Stinner37296e82010-06-10 13:36:23 +00003354 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003355 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003356 (strcmp(lower, "iso-8859-1") == 0) ||
3357 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003358 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003359#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003360 else if (strcmp(lower, "mbcs") == 0)
3361 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003362#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003363 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003364 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003365 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366
3367 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003368 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003370 return NULL;
3371
3372 /* The normal path */
3373 if (PyBytes_Check(v))
3374 return v;
3375
3376 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003377 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003378 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003379 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003380
3381 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003382 "encoder %s returned bytearray instead of bytes; "
3383 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003384 encoding);
3385 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003386 Py_DECREF(v);
3387 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003388 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003389
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003390 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3391 Py_DECREF(v);
3392 return b;
3393 }
3394
3395 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003396 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3397 "use codecs.encode() to encode to arbitrary types",
3398 encoding,
3399 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003400 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003401 return NULL;
3402}
3403
Alexander Belopolsky40018472011-02-26 01:02:56 +00003404PyObject *
3405PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003406 const char *encoding,
3407 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003408{
3409 PyObject *v;
3410
3411 if (!PyUnicode_Check(unicode)) {
3412 PyErr_BadArgument();
3413 goto onError;
3414 }
3415
3416 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003417 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003418
3419 /* Encode via the codec registry */
3420 v = PyCodec_Encode(unicode, encoding, errors);
3421 if (v == NULL)
3422 goto onError;
3423 if (!PyUnicode_Check(v)) {
3424 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003425 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3426 "use codecs.encode() to encode to arbitrary types",
3427 encoding,
3428 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003429 Py_DECREF(v);
3430 goto onError;
3431 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003433
Benjamin Peterson29060642009-01-31 22:14:21 +00003434 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435 return NULL;
3436}
3437
/* Locate the first byte sequence in the len bytes at str that mbrtowc()
   cannot convert.

   Returns the byte offset at which mbrtowc() reported a conversion error
   ((size_t)-1) or an incomplete character ((size_t)-2).  Returns 0 when no
   such sequence is found, when a null character is reached first, or when
   mbrtowc() is not available (HAVE_MBRTOWC undefined); 0 is therefore
   ambiguous between "first byte" and "not found". */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    size_t consumed;
    wchar_t ch;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        consumed = mbrtowc(&ch, cursor, remaining, &state);
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    (void)str;
    (void)len;
    return 0;
#endif
}
3468
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003469PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003470PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003471 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003472{
3473 wchar_t smallbuf[256];
3474 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3475 wchar_t *wstr;
3476 size_t wlen, wlen2;
3477 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003478 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003479 size_t error_pos;
3480 char *errmsg;
3481 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003482
3483 if (locale_error_handler(errors, &surrogateescape) < 0)
3484 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003485
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003486 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3487 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003488 return NULL;
3489 }
3490
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003491 if (surrogateescape) {
3492 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003493 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003494 if (wstr == NULL) {
3495 if (wlen == (size_t)-1)
3496 PyErr_NoMemory();
3497 else
3498 PyErr_SetFromErrno(PyExc_OSError);
3499 return NULL;
3500 }
3501
3502 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003503 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003504 }
3505 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003506 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003507#ifndef HAVE_BROKEN_MBSTOWCS
3508 wlen = mbstowcs(NULL, str, 0);
3509#else
3510 wlen = len;
3511#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003512 if (wlen == (size_t)-1)
3513 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003514 if (wlen+1 <= smallbuf_len) {
3515 wstr = smallbuf;
3516 }
3517 else {
3518 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3519 return PyErr_NoMemory();
3520
3521 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3522 if (!wstr)
3523 return PyErr_NoMemory();
3524 }
3525
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003526 wlen2 = mbstowcs(wstr, str, wlen+1);
3527 if (wlen2 == (size_t)-1) {
3528 if (wstr != smallbuf)
3529 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003530 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003531 }
3532#ifdef HAVE_BROKEN_MBSTOWCS
3533 assert(wlen2 == wlen);
3534#endif
3535 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3536 if (wstr != smallbuf)
3537 PyMem_Free(wstr);
3538 }
3539 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003540
3541decode_error:
3542 errmsg = strerror(errno);
3543 assert(errmsg != NULL);
3544
3545 error_pos = mbstowcs_errorpos(str, len);
3546 if (errmsg != NULL) {
3547 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003548 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003549 if (wstr != NULL) {
3550 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003551 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003552 } else
3553 errmsg = NULL;
3554 }
3555 if (errmsg == NULL)
3556 reason = PyUnicode_FromString(
3557 "mbstowcs() encountered an invalid multibyte sequence");
3558 if (reason == NULL)
3559 return NULL;
3560
3561 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3562 "locale", str, len,
3563 (Py_ssize_t)error_pos,
3564 (Py_ssize_t)(error_pos+1),
3565 reason);
3566 Py_DECREF(reason);
3567 if (exc != NULL) {
3568 PyCodec_StrictErrors(exc);
3569 Py_XDECREF(exc);
3570 }
3571 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003572}
3573
3574PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003575PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003576{
3577 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003578 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003579}
3580
3581
3582PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003583PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003584 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003585 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3586}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003587
Christian Heimes5894ba72007-11-04 11:43:14 +00003588PyObject*
3589PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3590{
Victor Stinner99b95382011-07-04 14:23:54 +02003591#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003592 return PyUnicode_DecodeMBCS(s, size, NULL);
3593#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003594 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003595#else
Victor Stinner793b5312011-04-27 00:24:21 +02003596 PyInterpreterState *interp = PyThreadState_GET()->interp;
3597 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3598 cannot use it to encode and decode filenames before it is loaded. Load
3599 the Python codec requires to encode at least its own filename. Use the C
3600 version of the locale codec until the codec registry is initialized and
3601 the Python codec is loaded.
3602
3603 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3604 cannot only rely on it: check also interp->fscodec_initialized for
3605 subinterpreters. */
3606 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003607 return PyUnicode_Decode(s, size,
3608 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003609 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003610 }
3611 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003612 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003613 }
Victor Stinnerad158722010-10-27 00:25:46 +00003614#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003615}
3616
Martin v. Löwis011e8422009-05-05 04:43:17 +00003617
3618int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003619_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003620{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003621 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003622
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003623 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003624 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003625 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3626 PyUnicode_GET_LENGTH(str), '\0', 1);
3627 if (pos == -1)
3628 return 0;
3629 else
3630 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003631}
3632
Antoine Pitrou13348842012-01-29 18:36:34 +01003633int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003634PyUnicode_FSConverter(PyObject* arg, void* addr)
3635{
3636 PyObject *output = NULL;
3637 Py_ssize_t size;
3638 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003639 if (arg == NULL) {
3640 Py_DECREF(*(PyObject**)addr);
3641 return 1;
3642 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003643 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003644 output = arg;
3645 Py_INCREF(output);
3646 }
3647 else {
3648 arg = PyUnicode_FromObject(arg);
3649 if (!arg)
3650 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003651 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003652 Py_DECREF(arg);
3653 if (!output)
3654 return 0;
3655 if (!PyBytes_Check(output)) {
3656 Py_DECREF(output);
3657 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3658 return 0;
3659 }
3660 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003661 size = PyBytes_GET_SIZE(output);
3662 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003663 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003664 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003665 Py_DECREF(output);
3666 return 0;
3667 }
3668 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003669 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003670}
3671
3672
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003673int
3674PyUnicode_FSDecoder(PyObject* arg, void* addr)
3675{
3676 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003677 if (arg == NULL) {
3678 Py_DECREF(*(PyObject**)addr);
3679 return 1;
3680 }
3681 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003682 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003683 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003684 output = arg;
3685 Py_INCREF(output);
3686 }
3687 else {
3688 arg = PyBytes_FromObject(arg);
3689 if (!arg)
3690 return 0;
3691 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3692 PyBytes_GET_SIZE(arg));
3693 Py_DECREF(arg);
3694 if (!output)
3695 return 0;
3696 if (!PyUnicode_Check(output)) {
3697 Py_DECREF(output);
3698 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3699 return 0;
3700 }
3701 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003702 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003703 Py_DECREF(output);
3704 return 0;
3705 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003706 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003707 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003708 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003709 Py_DECREF(output);
3710 return 0;
3711 }
3712 *(PyObject**)addr = output;
3713 return Py_CLEANUP_SUPPORTED;
3714}
3715
3716
Martin v. Löwis5b222132007-06-10 09:51:05 +00003717char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003718PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003719{
Christian Heimesf3863112007-11-22 07:46:41 +00003720 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003721
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003722 if (!PyUnicode_Check(unicode)) {
3723 PyErr_BadArgument();
3724 return NULL;
3725 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003726 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003727 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003728
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003729 if (PyUnicode_UTF8(unicode) == NULL) {
3730 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003731 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3732 if (bytes == NULL)
3733 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003734 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3735 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003736 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003737 Py_DECREF(bytes);
3738 return NULL;
3739 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003740 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3741 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3742 PyBytes_AS_STRING(bytes),
3743 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003744 Py_DECREF(bytes);
3745 }
3746
3747 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003748 *psize = PyUnicode_UTF8_LENGTH(unicode);
3749 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003750}
3751
3752char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003753PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003754{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003755 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3756}
3757
/* Return the wchar_t (Py_UNICODE) representation held in the object's wstr
   slot, building and storing it first when the slot is still NULL.

   unicode -- must be a str object, otherwise PyErr_BadArgument() is raised.
   size    -- optional; when non-NULL it receives PyUnicode_WSTR_LENGTH().

   Returns the buffer stored in the object, or NULL with an exception set
   on a bad argument or allocation failure.  Object states that the code
   treats as impossible end in Py_FatalError(). */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
    /* Only the source pointers needed for a real conversion on this
       platform are declared. */
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count the code points above 0xFFFF, each of
               which needs one extra wchar_t for its surrogate pair. */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* length + surrogate extras + 1 for the terminating null */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: copy, splitting code points above 0xFFFF. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* sanity check: the first pass must have sized the buffer */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            /* null-terminate the wstr */
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte data: one wchar_t per character plus the
               terminating null. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* NOTE(review): the wstr length is deliberately not written for
               compact ASCII objects -- presumably they have no separate
               field for it; confirm against the object layout. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* widen each byte to a wchar_t */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* widen each 2-byte unit to a 4-byte wchar_t */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* release the buffer allocated above before aborting */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3869
Alexander Belopolsky40018472011-02-26 01:02:56 +00003870Py_UNICODE *
3871PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003873 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874}
3875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003876
Alexander Belopolsky40018472011-02-26 01:02:56 +00003877Py_ssize_t
3878PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003879{
3880 if (!PyUnicode_Check(unicode)) {
3881 PyErr_BadArgument();
3882 goto onError;
3883 }
3884 return PyUnicode_GET_SIZE(unicode);
3885
Benjamin Peterson29060642009-01-31 22:14:21 +00003886 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887 return -1;
3888}
3889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890Py_ssize_t
3891PyUnicode_GetLength(PyObject *unicode)
3892{
Victor Stinner07621332012-06-16 04:53:46 +02003893 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003894 PyErr_BadArgument();
3895 return -1;
3896 }
Victor Stinner07621332012-06-16 04:53:46 +02003897 if (PyUnicode_READY(unicode) == -1)
3898 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899 return PyUnicode_GET_LENGTH(unicode);
3900}
3901
3902Py_UCS4
3903PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3904{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003905 void *data;
3906 int kind;
3907
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003908 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3909 PyErr_BadArgument();
3910 return (Py_UCS4)-1;
3911 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003912 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003913 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003914 return (Py_UCS4)-1;
3915 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003916 data = PyUnicode_DATA(unicode);
3917 kind = PyUnicode_KIND(unicode);
3918 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919}
3920
3921int
3922PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3923{
3924 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003925 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 return -1;
3927 }
Victor Stinner488fa492011-12-12 00:01:39 +01003928 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003929 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003930 PyErr_SetString(PyExc_IndexError, "string index out of range");
3931 return -1;
3932 }
Victor Stinner488fa492011-12-12 00:01:39 +01003933 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003934 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003935 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3936 PyErr_SetString(PyExc_ValueError, "character out of range");
3937 return -1;
3938 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3940 index, ch);
3941 return 0;
3942}
3943
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3949
Victor Stinner554f3f02010-06-16 23:33:54 +00003950/* create or adjust a UnicodeDecodeError */
3951static void
3952make_decode_exception(PyObject **exceptionObject,
3953 const char *encoding,
3954 const char *input, Py_ssize_t length,
3955 Py_ssize_t startpos, Py_ssize_t endpos,
3956 const char *reason)
3957{
3958 if (*exceptionObject == NULL) {
3959 *exceptionObject = PyUnicodeDecodeError_Create(
3960 encoding, input, length, startpos, endpos, reason);
3961 }
3962 else {
3963 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3964 goto onError;
3965 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3966 goto onError;
3967 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3968 goto onError;
3969 }
3970 return;
3971
3972onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02003973 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00003974}
3975
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors, errorHandler -- handler name and cached handler object; the
                           handler is looked up on first use.
   encoding, reason     -- used to create or update *exceptionObject.
   input, inend         -- start/end of the input; refreshed from the
                           exception's object after the callback runs.
   startinpos, endinpos -- bounds of the offending input; *endinpos is set
                           to the position returned by the handler.
   inptr                -- set to *input + new position.
   output, outpos       -- wchar_t-kind output string (resized if needed)
                           and the write position inside it.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* &argparse[4] skips the "O!n;" prefix and is the bare error message */
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the object cannot be used as bytes below, and returning
           success with an exception set would confuse the caller. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* a negative position is relative to the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* grow at least geometrically when doubling cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4091
4092static int
4093unicode_decode_call_errorhandler_writer(
4094 const char *errors, PyObject **errorHandler,
4095 const char *encoding, const char *reason,
4096 const char **input, const char **inend, Py_ssize_t *startinpos,
4097 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4098 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4099{
4100 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4101
4102 PyObject *restuple = NULL;
4103 PyObject *repunicode = NULL;
4104 Py_ssize_t insize;
4105 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004106 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004107 PyObject *inputobj = NULL;
4108
4109 if (*errorHandler == NULL) {
4110 *errorHandler = PyCodec_LookupError(errors);
4111 if (*errorHandler == NULL)
4112 goto onError;
4113 }
4114
4115 make_decode_exception(exceptionObject,
4116 encoding,
4117 *input, *inend - *input,
4118 *startinpos, *endinpos,
4119 reason);
4120 if (*exceptionObject == NULL)
4121 goto onError;
4122
4123 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4124 if (restuple == NULL)
4125 goto onError;
4126 if (!PyTuple_Check(restuple)) {
4127 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4128 goto onError;
4129 }
4130 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004131 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004132
4133 /* Copy back the bytes variables, which might have been modified by the
4134 callback */
4135 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4136 if (!inputobj)
4137 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004138 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004139 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004140 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004141 *input = PyBytes_AS_STRING(inputobj);
4142 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004143 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004144 /* we can DECREF safely, as the exception has another reference,
4145 so the object won't go away. */
4146 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004147
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004149 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004150 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004151 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004152 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004153 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004154
Victor Stinner8f674cc2013-04-17 23:02:17 +02004155 if (PyUnicode_READY(repunicode) < 0)
4156 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004157 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004158 if (replen > 1) {
4159 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004160 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004161 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4162 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4163 goto onError;
4164 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004165 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004166 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004167
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004168 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004169 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004170
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004172 Py_XDECREF(restuple);
4173 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174
Benjamin Peterson29060642009-01-31 22:14:21 +00004175 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004177 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178}
4179
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004180/* --- UTF-7 Codec -------------------------------------------------------- */
4181
Antoine Pitrou244651a2009-05-04 18:56:13 +00004182/* See RFC2152 for details. We encode conservatively and decode liberally. */
4183
/* Helper macros for the base-64 alphabet used by UTF-7.  Each macro may
   evaluate its argument more than once. */

/* Non-zero when c is one of the 64 base-64 alphabet characters. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* 6-bit value of the base-64 character c; the caller guarantees that
   IS_BASE64(c) holds (anything unrecognized maps to 63, the value of '/'). */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n) \
    (*("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/" + ((n) & 63)))

/* DECODE_DIRECT: non-zero when a byte found in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except '+',
 * which introduces a base-64 sequence. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4214
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array is indexed by the ASCII code
 * (0-127) of a character and gives the set it belongs to:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * Each row below covers 16 consecutive codes; the comment above a row
 * names the characters it describes.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4248
/* ENCODE_DIRECT: true when character c may be written to the UTF-7
 * output as itself instead of being base-64 encoded.  The answer
 * depends on whether we are encoding set O as itself, and also on
 * whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - code point to classify; evaluated several times, so it
 *              must be free of side effects
 *   directO  - non-zero if "Set O" characters are emitted directly
 *   directWS - non-zero if whitespace is emitted directly
 *
 * NUL and everything outside ASCII are never direct; the range test
 * also guards the utf7_category[] lookup.  The two flag arguments are
 * parenthesized so that a compound expression (e.g. `a || b`) is
 * applied as a unit rather than being split by operator precedence. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004260
Alexander Belopolsky40018472011-02-26 01:02:56 +00004261PyObject *
4262PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004263 Py_ssize_t size,
4264 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004265{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004266 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4267}
4268
Antoine Pitrou244651a2009-05-04 18:56:13 +00004269/* The decoder. The only state we preserve is our read position,
4270 * i.e. how many characters we have consumed. So if we end in the
4271 * middle of a shift sequence we have to back off the read position
4272 * and the output to the beginning of the sequence, otherwise we lose
4273 * all the shift state (seen bits, number of bits seen, high
4274 * surrogate). */
4275
Alexander Belopolsky40018472011-02-26 01:02:56 +00004276PyObject *
4277PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004278 Py_ssize_t size,
4279 const char *errors,
4280 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004281{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004283 Py_ssize_t startinpos;
4284 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004285 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004286 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004287 const char *errmsg = "";
4288 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004289 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004290 unsigned int base64bits = 0;
4291 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004292 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293 PyObject *errorHandler = NULL;
4294 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004295
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004296 if (size == 0) {
4297 if (consumed)
4298 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004299 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004300 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004301
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004302 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004303 _PyUnicodeWriter_Init(&writer);
4304 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004305
4306 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004307 e = s + size;
4308
4309 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004310 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004311 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004312 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004313
Antoine Pitrou244651a2009-05-04 18:56:13 +00004314 if (inShift) { /* in a base-64 section */
4315 if (IS_BASE64(ch)) { /* consume a base-64 character */
4316 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4317 base64bits += 6;
4318 s++;
4319 if (base64bits >= 16) {
4320 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004321 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004322 base64bits -= 16;
4323 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004324 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004325 if (surrogate) {
4326 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004327 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4328 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004329 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004330 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004331 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004332 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004333 }
4334 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004335 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004336 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004337 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004338 }
4339 }
Victor Stinner551ac952011-11-29 22:58:13 +01004340 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004341 /* first surrogate */
4342 surrogate = outCh;
4343 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004344 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004345 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004346 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004347 }
4348 }
4349 }
4350 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004351 inShift = 0;
4352 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004353 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004354 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004355 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004356 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004357 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 if (base64bits > 0) { /* left-over bits */
4359 if (base64bits >= 6) {
4360 /* We've seen at least one base-64 character */
4361 errmsg = "partial character in shift sequence";
4362 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004363 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004364 else {
4365 /* Some bits remain; they should be zero */
4366 if (base64buffer != 0) {
4367 errmsg = "non-zero padding bits in shift sequence";
4368 goto utf7Error;
4369 }
4370 }
4371 }
4372 if (ch != '-') {
4373 /* '-' is absorbed; other terminating
4374 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004375 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004376 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004377 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378 }
4379 }
4380 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004382 s++; /* consume '+' */
4383 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004384 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004385 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004386 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004387 }
4388 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004389 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004390 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004392 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004393 }
4394 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004395 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004396 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004397 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004398 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004400 else {
4401 startinpos = s-starts;
4402 s++;
4403 errmsg = "unexpected special character";
4404 goto utf7Error;
4405 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004406 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004407utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004409 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 errors, &errorHandler,
4411 "utf7", errmsg,
4412 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004413 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004415 }
4416
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417 /* end of string */
4418
4419 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4420 /* if we're in an inconsistent state, that's an error */
4421 if (surrogate ||
4422 (base64bits >= 6) ||
4423 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004425 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004426 errors, &errorHandler,
4427 "utf7", "unterminated shift sequence",
4428 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004429 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430 goto onError;
4431 if (s < e)
4432 goto restart;
4433 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004434 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004435
4436 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004437 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004439 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004440 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004441 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004442 writer.kind, writer.data, shiftOutStart);
4443 Py_XDECREF(errorHandler);
4444 Py_XDECREF(exc);
4445 _PyUnicodeWriter_Dealloc(&writer);
4446 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004447 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004448 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004449 }
4450 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004451 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004453 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004454
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 Py_XDECREF(errorHandler);
4456 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004457 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004458
Benjamin Peterson29060642009-01-31 22:14:21 +00004459 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004460 Py_XDECREF(errorHandler);
4461 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004462 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004463 return NULL;
4464}
4465
4466
Alexander Belopolsky40018472011-02-26 01:02:56 +00004467PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004468_PyUnicode_EncodeUTF7(PyObject *str,
4469 int base64SetO,
4470 int base64WhiteSpace,
4471 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004473 int kind;
4474 void *data;
4475 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004476 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004477 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004478 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004479 unsigned int base64bits = 0;
4480 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004481 char * out;
4482 char * start;
4483
Benjamin Petersonbac79492012-01-14 13:34:47 -05004484 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004485 return NULL;
4486 kind = PyUnicode_KIND(str);
4487 data = PyUnicode_DATA(str);
4488 len = PyUnicode_GET_LENGTH(str);
4489
4490 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004493 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004494 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004495 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004496 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004497 if (v == NULL)
4498 return NULL;
4499
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004500 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004501 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004502 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004503
Antoine Pitrou244651a2009-05-04 18:56:13 +00004504 if (inShift) {
4505 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4506 /* shifting out */
4507 if (base64bits) { /* output remaining bits */
4508 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4509 base64buffer = 0;
4510 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004511 }
4512 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004513 /* Characters not in the BASE64 set implicitly unshift the sequence
4514 so no '-' is required, except if the character is itself a '-' */
4515 if (IS_BASE64(ch) || ch == '-') {
4516 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004517 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004518 *out++ = (char) ch;
4519 }
4520 else {
4521 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004522 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004523 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524 else { /* not in a shift sequence */
4525 if (ch == '+') {
4526 *out++ = '+';
4527 *out++ = '-';
4528 }
4529 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4530 *out++ = (char) ch;
4531 }
4532 else {
4533 *out++ = '+';
4534 inShift = 1;
4535 goto encode_char;
4536 }
4537 }
4538 continue;
4539encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004540 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004541 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004542
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 /* code first surrogate */
4544 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004545 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004546 while (base64bits >= 6) {
4547 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4548 base64bits -= 6;
4549 }
4550 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004551 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004552 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004553 base64bits += 16;
4554 base64buffer = (base64buffer << 16) | ch;
4555 while (base64bits >= 6) {
4556 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4557 base64bits -= 6;
4558 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004559 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004560 if (base64bits)
4561 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4562 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004563 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004564 if (_PyBytes_Resize(&v, out - start) < 0)
4565 return NULL;
4566 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004567}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004568PyObject *
4569PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4570 Py_ssize_t size,
4571 int base64SetO,
4572 int base64WhiteSpace,
4573 const char *errors)
4574{
4575 PyObject *result;
4576 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4577 if (tmp == NULL)
4578 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004579 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004580 base64WhiteSpace, errors);
4581 Py_DECREF(tmp);
4582 return result;
4583}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584
/* The UTF-7 helper macros are removed here so that they are not
   visible to the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004590
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591/* --- UTF-8 Codec -------------------------------------------------------- */
4592
Alexander Belopolsky40018472011-02-26 01:02:56 +00004593PyObject *
4594PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004595 Py_ssize_t size,
4596 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597{
Walter Dörwald69652032004-09-07 20:24:22 +00004598 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4599}
4600
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004601#include "stringlib/asciilib.h"
4602#include "stringlib/codecs.h"
4603#include "stringlib/undef.h"
4604
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004605#include "stringlib/ucs1lib.h"
4606#include "stringlib/codecs.h"
4607#include "stringlib/undef.h"
4608
4609#include "stringlib/ucs2lib.h"
4610#include "stringlib/codecs.h"
4611#include "stringlib/undef.h"
4612
4613#include "stringlib/ucs4lib.h"
4614#include "stringlib/codecs.h"
4615#include "stringlib/undef.h"
4616
Antoine Pitrouab868312009-01-10 15:40:25 +00004617/* Mask to quickly check whether a C 'long' contains a
4618 non-ASCII, UTF8-encoded char. */
4619#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004620# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004621#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004622# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004623#else
4624# error C 'long' size should be either 4 or 8!
4625#endif
4626
/* Copy the leading run of ASCII bytes (bytes < 0x80) of [start, end)
   to dest and return the length of that run.  Scanning stops at the
   first byte with the high bit set or at end, whichever comes first.
   dest must have room for end - start bytes.  Where the pointers are
   suitably aligned, whole 'long' words are tested against
   ASCII_CHAR_MASK instead of single bytes. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* last 'long'-aligned address not beyond end: limit for word reads */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Help allocation */
        /* NOTE(review): presumably "register allocation" - confirm */
        const char *_p = p;
        Py_UCS1 * q = dest;
        /* source and destination are both aligned: test and copy one
           word at a time while the word is pure ASCII */
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* finish byte by byte: the tail, or the word that failed the test */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* General path: only locate the end of the ASCII run here; the
       bytes are copied by the single memcpy() below. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            /* the word scan consumed everything: nothing left to test */
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
Antoine Pitrouab868312009-01-10 15:40:25 +00004690
/* Decode the size bytes of UTF-8 data at s.

     errors   - error handler name, handed to
                unicode_decode_call_errorhandler_writer()
     consumed - when non-NULL, receives the number of input bytes that
                were decoded; in that mode a decoder stop before the end
                of the input (result code 0) ends decoding without an
                error

   Returns the decoded string, or NULL on failure. */
PyObject *
PyUnicode_DecodeUTF8Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    _PyUnicodeWriter writer;
    const char *starts = s;
    const char *end = s + size;

    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && (unsigned char)s[0] < 128) {
        if (consumed)
            *consumed = 1;
        return get_latin1_char((unsigned char)s[0]);
    }

    /* Prepare a writer for `size` characters with maximum character 127,
       i.e. start out assuming pure ASCII. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    /* Copy the leading ASCII run in one go. */
    writer.pos = ascii_decode(s, end, writer.data);
    s += writer.pos;
    while (s < end) {
        Py_UCS4 ch;
        int kind = writer.kind;
        /* Run the decoder that matches the writer's current storage
           kind; it advances s and writer.pos itself. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (PyUnicode_IS_ASCII(writer.buffer))
                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
            else
                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else if (kind == PyUnicode_2BYTE_KIND) {
            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
        } else {
            assert(kind == PyUnicode_4BYTE_KIND);
            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
        }

        /* Results 0-4 are status codes; anything else is a character
           that is written through the writer (NOTE(review): presumably
           one that did not fit the current kind - confirm against
           stringlib/codecs.h). */
        switch (ch) {
        case 0:
            /* The decoder stopped.  Done if the input is exhausted, or
               if the caller is decoding statefully. */
            if (s == end || consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = s - starts;
            endinpos = end - starts;
            break;
        case 1:
            errmsg = "invalid start byte";
            startinpos = s - starts;
            endinpos = startinpos + 1;
            break;
        case 2:
        case 3:
        case 4:
            /* the reported span is ch - 1 bytes long */
            errmsg = "invalid continuation byte";
            startinpos = s - starts;
            endinpos = startinpos + ch - 1;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        /* Only the error cases above reach this point. */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf-8", errmsg,
                &starts, &end, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = s - starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4790
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Bytes that cannot be decoded are stored as 0xDC00 + byte value.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation error.
   NULL is also returned when size is too large for the buffer size to
   be computed. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
            /* The decoder handed back a character instead of storing it. */
#if SIZEOF_WCHAR_T == 4
            /* a 4-byte wchar_t is not expected to get here */
            assert(0);
#else
            /* The surrogate macros below split a code point above the
               BMP into its UTF-16 pair, so that is what ch must be
               here - not a value that is already a surrogate. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* ch is a status code: 0 with the input exhausted means done */
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004847/* Primary internal function which creates utf8 encoded bytes objects.
4848
4849 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004850 and allocate exactly as much space needed at the end. Else allocate the
4851 maximum possible needed (4 result bytes per Unicode character), and return
4852 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004853*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004854PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004855_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856{
Victor Stinner6099a032011-12-18 14:22:26 +01004857 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004858 void *data;
4859 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004861 if (!PyUnicode_Check(unicode)) {
4862 PyErr_BadArgument();
4863 return NULL;
4864 }
4865
4866 if (PyUnicode_READY(unicode) == -1)
4867 return NULL;
4868
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004869 if (PyUnicode_UTF8(unicode))
4870 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4871 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004872
4873 kind = PyUnicode_KIND(unicode);
4874 data = PyUnicode_DATA(unicode);
4875 size = PyUnicode_GET_LENGTH(unicode);
4876
Benjamin Petersonead6b532011-12-20 17:23:42 -06004877 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004878 default:
4879 assert(0);
4880 case PyUnicode_1BYTE_KIND:
4881 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4882 assert(!PyUnicode_IS_ASCII(unicode));
4883 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4884 case PyUnicode_2BYTE_KIND:
4885 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4886 case PyUnicode_4BYTE_KIND:
4887 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889}
4890
Alexander Belopolsky40018472011-02-26 01:02:56 +00004891PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004892PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4893 Py_ssize_t size,
4894 const char *errors)
4895{
4896 PyObject *v, *unicode;
4897
4898 unicode = PyUnicode_FromUnicode(s, size);
4899 if (unicode == NULL)
4900 return NULL;
4901 v = _PyUnicode_AsUTF8String(unicode, errors);
4902 Py_DECREF(unicode);
4903 return v;
4904}
4905
4906PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004907PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004909 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910}
4911
/* --- UTF-32 Codec ------------------------------------------------------- */
4913
4914PyObject *
4915PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004916 Py_ssize_t size,
4917 const char *errors,
4918 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004919{
4920 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4921}
4922
4923PyObject *
4924PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004925 Py_ssize_t size,
4926 const char *errors,
4927 int *byteorder,
4928 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004929{
4930 const char *starts = s;
4931 Py_ssize_t startinpos;
4932 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004933 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004934 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004935 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004936 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004937 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004938 PyObject *errorHandler = NULL;
4939 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004940
Walter Dörwald41980ca2007-08-16 21:55:45 +00004941 q = (unsigned char *)s;
4942 e = q + size;
4943
4944 if (byteorder)
4945 bo = *byteorder;
4946
4947 /* Check for BOM marks (U+FEFF) in the input and adjust current
4948 byte order setting accordingly. In native mode, the leading BOM
4949 mark is skipped, in all other modes, it is copied to the output
4950 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004951 if (bo == 0 && size >= 4) {
4952 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4953 if (bom == 0x0000FEFF) {
4954 bo = -1;
4955 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004956 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004957 else if (bom == 0xFFFE0000) {
4958 bo = 1;
4959 q += 4;
4960 }
4961 if (byteorder)
4962 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004963 }
4964
Victor Stinnere64322e2012-10-30 23:12:47 +01004965 if (q == e) {
4966 if (consumed)
4967 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004968 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004969 }
4970
Victor Stinnere64322e2012-10-30 23:12:47 +01004971#ifdef WORDS_BIGENDIAN
4972 le = bo < 0;
4973#else
4974 le = bo <= 0;
4975#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004976 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004977
Victor Stinner8f674cc2013-04-17 23:02:17 +02004978 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004979 writer.min_length = (e - q + 3) / 4;
4980 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004981 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004982
Victor Stinnere64322e2012-10-30 23:12:47 +01004983 while (1) {
4984 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004985 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004986
Victor Stinnere64322e2012-10-30 23:12:47 +01004987 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004988 enum PyUnicode_Kind kind = writer.kind;
4989 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004990 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004991 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004992 if (le) {
4993 do {
4994 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4995 if (ch > maxch)
4996 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004997 if (kind != PyUnicode_1BYTE_KIND &&
4998 Py_UNICODE_IS_SURROGATE(ch))
4999 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005000 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005001 q += 4;
5002 } while (q <= last);
5003 }
5004 else {
5005 do {
5006 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5007 if (ch > maxch)
5008 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005009 if (kind != PyUnicode_1BYTE_KIND &&
5010 Py_UNICODE_IS_SURROGATE(ch))
5011 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005012 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005013 q += 4;
5014 } while (q <= last);
5015 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005016 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005017 }
5018
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005019 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005020 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005021 startinpos = ((const char *)q) - starts;
5022 endinpos = startinpos + 4;
5023 }
5024 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005025 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005027 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005029 startinpos = ((const char *)q) - starts;
5030 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005032 else {
5033 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005034 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005035 goto onError;
5036 q += 4;
5037 continue;
5038 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005039 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005040 startinpos = ((const char *)q) - starts;
5041 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005043
5044 /* The remaining input chars are ignored if the callback
5045 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005046 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005048 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005049 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005050 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005052 }
5053
Walter Dörwald41980ca2007-08-16 21:55:45 +00005054 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005056
Walter Dörwald41980ca2007-08-16 21:55:45 +00005057 Py_XDECREF(errorHandler);
5058 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005059 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005060
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005062 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005063 Py_XDECREF(errorHandler);
5064 Py_XDECREF(exc);
5065 return NULL;
5066}
5067
5068PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005069_PyUnicode_EncodeUTF32(PyObject *str,
5070 const char *errors,
5071 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005072{
Serhiy Storchaka30793282014-01-04 22:44:01 +02005073 int kind;
5074 void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005075 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005076 PyObject *v;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005077 unsigned char *p;
5078 Py_ssize_t nsize, i;
5079 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005080#if PY_LITTLE_ENDIAN
Serhiy Storchaka30793282014-01-04 22:44:01 +02005081 int iorder[] = {0, 1, 2, 3};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005082#else
Serhiy Storchaka30793282014-01-04 22:44:01 +02005083 int iorder[] = {3, 2, 1, 0};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005085 const char *encoding;
5086 PyObject *errorHandler = NULL;
5087 PyObject *exc = NULL;
5088 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005089
Serhiy Storchaka30793282014-01-04 22:44:01 +02005090#define STORECHAR(CH) \
5091 do { \
5092 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5093 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5094 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5095 p[iorder[0]] = (CH) & 0xff; \
5096 p += 4; \
5097 } while(0)
5098
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005099 if (!PyUnicode_Check(str)) {
5100 PyErr_BadArgument();
5101 return NULL;
5102 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005103 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005104 return NULL;
5105 kind = PyUnicode_KIND(str);
5106 data = PyUnicode_DATA(str);
5107 len = PyUnicode_GET_LENGTH(str);
5108
Serhiy Storchaka583a9392014-01-04 19:25:37 +02005109 nsize = len + (byteorder == 0);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005110 if (nsize > PY_SSIZE_T_MAX / 4)
5111 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005112 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005113 if (v == NULL)
5114 return NULL;
5115
Serhiy Storchaka30793282014-01-04 22:44:01 +02005116 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005117 if (byteorder == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005118 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005119 if (len == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005120 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005121
Serhiy Storchaka30793282014-01-04 22:44:01 +02005122 if (byteorder == -1) {
5123 /* force LE */
5124 iorder[0] = 0;
5125 iorder[1] = 1;
5126 iorder[2] = 2;
5127 iorder[3] = 3;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005128 encoding = "utf-32-le";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005129 }
5130 else if (byteorder == 1) {
5131 /* force BE */
5132 iorder[0] = 3;
5133 iorder[1] = 2;
5134 iorder[2] = 1;
5135 iorder[3] = 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005136 encoding = "utf-32-be";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005137 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005138 else
5139 encoding = "utf-32";
5140
5141 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005142 for (i = 0; i < len; i++)
5143 STORECHAR(PyUnicode_READ(kind, data, i));
5144 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005145 }
5146
Serhiy Storchaka30793282014-01-04 22:44:01 +02005147 for (i = 0; i < len;) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005148 Py_ssize_t repsize, moreunits;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005149 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5150 i++;
5151 assert(ch <= MAX_UNICODE);
5152 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5153 STORECHAR(ch);
5154 continue;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005155 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005156
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005157 rep = unicode_encode_call_errorhandler(
5158 errors, &errorHandler,
5159 encoding, "surrogates not allowed",
Serhiy Storchaka30793282014-01-04 22:44:01 +02005160 str, &exc, i-1, i, &i);
5161
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005162 if (!rep)
5163 goto error;
5164
5165 if (PyBytes_Check(rep)) {
5166 repsize = PyBytes_GET_SIZE(rep);
5167 if (repsize & 3) {
5168 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005169 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005170 "surrogates not allowed");
5171 goto error;
5172 }
5173 moreunits = repsize / 4;
5174 }
5175 else {
5176 assert(PyUnicode_Check(rep));
5177 if (PyUnicode_READY(rep) < 0)
5178 goto error;
5179 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5180 if (!PyUnicode_IS_ASCII(rep)) {
5181 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005182 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005183 "surrogates not allowed");
5184 goto error;
5185 }
5186 }
5187
5188 /* four bytes are reserved for each surrogate */
5189 if (moreunits > 1) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005190 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005191 Py_ssize_t morebytes = 4 * (moreunits - 1);
5192 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5193 /* integer overflow */
5194 PyErr_NoMemory();
5195 goto error;
5196 }
5197 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5198 goto error;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005199 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005200 }
5201
5202 if (PyBytes_Check(rep)) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005203 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5204 p += repsize;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005205 } else /* rep is unicode */ {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005206 const Py_UCS1 *repdata;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005207 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005208 repdata = PyUnicode_1BYTE_DATA(rep);
5209 while (repsize--) {
5210 Py_UCS4 ch = *repdata++;
5211 STORECHAR(ch);
5212 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005213 }
5214
5215 Py_CLEAR(rep);
5216 }
5217
5218 /* Cut back to size actually needed. This is necessary for, for example,
5219 encoding of a string containing isolated surrogates and the 'ignore'
5220 handler is used. */
Serhiy Storchaka30793282014-01-04 22:44:01 +02005221 nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005222 if (nsize != PyBytes_GET_SIZE(v))
5223 _PyBytes_Resize(&v, nsize);
5224 Py_XDECREF(errorHandler);
5225 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005226 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005227 error:
5228 Py_XDECREF(rep);
5229 Py_XDECREF(errorHandler);
5230 Py_XDECREF(exc);
5231 Py_XDECREF(v);
5232 return NULL;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005233#undef STORECHAR
Walter Dörwald41980ca2007-08-16 21:55:45 +00005234}
5235
Alexander Belopolsky40018472011-02-26 01:02:56 +00005236PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005237PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5238 Py_ssize_t size,
5239 const char *errors,
5240 int byteorder)
5241{
5242 PyObject *result;
5243 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5244 if (tmp == NULL)
5245 return NULL;
5246 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5247 Py_DECREF(tmp);
5248 return result;
5249}
5250
5251PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005252PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005253{
Victor Stinnerb960b342011-11-20 19:12:52 +01005254 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005255}
5256
/* --- UTF-16 Codec ------------------------------------------------------- */
5258
Tim Peters772747b2001-08-09 22:21:55 +00005259PyObject *
5260PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 Py_ssize_t size,
5262 const char *errors,
5263 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264{
Walter Dörwald69652032004-09-07 20:24:22 +00005265 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5266}
5267
5268PyObject *
5269PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 Py_ssize_t size,
5271 const char *errors,
5272 int *byteorder,
5273 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005274{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005275 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005276 Py_ssize_t startinpos;
5277 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005278 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005279 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005280 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005281 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005282 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005283 PyObject *errorHandler = NULL;
5284 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005285 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286
Tim Peters772747b2001-08-09 22:21:55 +00005287 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005288 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289
5290 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005291 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005293 /* Check for BOM marks (U+FEFF) in the input and adjust current
5294 byte order setting accordingly. In native mode, the leading BOM
5295 mark is skipped, in all other modes, it is copied to the output
5296 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005297 if (bo == 0 && size >= 2) {
5298 const Py_UCS4 bom = (q[1] << 8) | q[0];
5299 if (bom == 0xFEFF) {
5300 q += 2;
5301 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005303 else if (bom == 0xFFFE) {
5304 q += 2;
5305 bo = 1;
5306 }
5307 if (byteorder)
5308 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005309 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310
Antoine Pitrou63065d72012-05-15 23:48:04 +02005311 if (q == e) {
5312 if (consumed)
5313 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005314 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005315 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005316
Christian Heimes743e0cd2012-10-17 23:52:17 +02005317#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005318 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005319 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005320#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005321 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005322 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005323#endif
Tim Peters772747b2001-08-09 22:21:55 +00005324
Antoine Pitrou63065d72012-05-15 23:48:04 +02005325 /* Note: size will always be longer than the resulting Unicode
5326 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005327 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005328 writer.min_length = (e - q + 1) / 2;
5329 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005330 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005331
Antoine Pitrou63065d72012-05-15 23:48:04 +02005332 while (1) {
5333 Py_UCS4 ch = 0;
5334 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005335 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005336 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005337 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005338 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005339 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005340 native_ordering);
5341 else
5342 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005343 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005344 native_ordering);
5345 } else if (kind == PyUnicode_2BYTE_KIND) {
5346 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005347 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005348 native_ordering);
5349 } else {
5350 assert(kind == PyUnicode_4BYTE_KIND);
5351 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005352 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005353 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005354 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005355 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005356
Antoine Pitrou63065d72012-05-15 23:48:04 +02005357 switch (ch)
5358 {
5359 case 0:
5360 /* remaining byte at the end? (size should be even) */
5361 if (q == e || consumed)
5362 goto End;
5363 errmsg = "truncated data";
5364 startinpos = ((const char *)q) - starts;
5365 endinpos = ((const char *)e) - starts;
5366 break;
5367 /* The remaining input chars are ignored if the callback
5368 chooses to skip the input */
5369 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005370 q -= 2;
5371 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005372 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005373 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005374 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005375 endinpos = ((const char *)e) - starts;
5376 break;
5377 case 2:
5378 errmsg = "illegal encoding";
5379 startinpos = ((const char *)q) - 2 - starts;
5380 endinpos = startinpos + 2;
5381 break;
5382 case 3:
5383 errmsg = "illegal UTF-16 surrogate";
5384 startinpos = ((const char *)q) - 4 - starts;
5385 endinpos = startinpos + 2;
5386 break;
5387 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005388 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005389 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005390 continue;
5391 }
5392
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005393 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005394 errors,
5395 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005396 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005397 &starts,
5398 (const char **)&e,
5399 &startinpos,
5400 &endinpos,
5401 &exc,
5402 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005403 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 }
5406
Antoine Pitrou63065d72012-05-15 23:48:04 +02005407End:
Walter Dörwald69652032004-09-07 20:24:22 +00005408 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005410
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005411 Py_XDECREF(errorHandler);
5412 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005413 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414
Benjamin Peterson29060642009-01-31 22:14:21 +00005415 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005416 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005417 Py_XDECREF(errorHandler);
5418 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 return NULL;
5420}
5421
Tim Peters772747b2001-08-09 22:21:55 +00005422PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005423_PyUnicode_EncodeUTF16(PyObject *str,
5424 const char *errors,
5425 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005427 enum PyUnicode_Kind kind;
5428 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005429 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005430 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005431 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005432 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005433#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005434 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005435#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005436 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005437#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005438 const char *encoding;
5439 Py_ssize_t nsize, pos;
5440 PyObject *errorHandler = NULL;
5441 PyObject *exc = NULL;
5442 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005443
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005444 if (!PyUnicode_Check(str)) {
5445 PyErr_BadArgument();
5446 return NULL;
5447 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005448 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005449 return NULL;
5450 kind = PyUnicode_KIND(str);
5451 data = PyUnicode_DATA(str);
5452 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005453
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005454 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005455 if (kind == PyUnicode_4BYTE_KIND) {
5456 const Py_UCS4 *in = (const Py_UCS4 *)data;
5457 const Py_UCS4 *end = in + len;
5458 while (in < end)
5459 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005460 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005461 }
5462 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005463 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005464 nsize = len + pairs + (byteorder == 0);
5465 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466 if (v == NULL)
5467 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005469 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005470 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005471 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005473 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005474 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005475 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005476
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005477 if (kind == PyUnicode_1BYTE_KIND) {
5478 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5479 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005480 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005481
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005482 if (byteorder < 0)
5483 encoding = "utf-16-le";
5484 else if (byteorder > 0)
5485 encoding = "utf-16-be";
5486 else
5487 encoding = "utf-16";
5488
5489 pos = 0;
5490 while (pos < len) {
5491 Py_ssize_t repsize, moreunits;
5492
5493 if (kind == PyUnicode_2BYTE_KIND) {
5494 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5495 &out, native_ordering);
5496 }
5497 else {
5498 assert(kind == PyUnicode_4BYTE_KIND);
5499 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5500 &out, native_ordering);
5501 }
5502 if (pos == len)
5503 break;
5504
5505 rep = unicode_encode_call_errorhandler(
5506 errors, &errorHandler,
5507 encoding, "surrogates not allowed",
5508 str, &exc, pos, pos + 1, &pos);
5509 if (!rep)
5510 goto error;
5511
5512 if (PyBytes_Check(rep)) {
5513 repsize = PyBytes_GET_SIZE(rep);
5514 if (repsize & 1) {
5515 raise_encode_exception(&exc, encoding,
5516 str, pos - 1, pos,
5517 "surrogates not allowed");
5518 goto error;
5519 }
5520 moreunits = repsize / 2;
5521 }
5522 else {
5523 assert(PyUnicode_Check(rep));
5524 if (PyUnicode_READY(rep) < 0)
5525 goto error;
5526 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5527 if (!PyUnicode_IS_ASCII(rep)) {
5528 raise_encode_exception(&exc, encoding,
5529 str, pos - 1, pos,
5530 "surrogates not allowed");
5531 goto error;
5532 }
5533 }
5534
5535 /* two bytes are reserved for each surrogate */
5536 if (moreunits > 1) {
5537 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5538 Py_ssize_t morebytes = 2 * (moreunits - 1);
5539 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5540 /* integer overflow */
5541 PyErr_NoMemory();
5542 goto error;
5543 }
5544 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5545 goto error;
5546 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5547 }
5548
5549 if (PyBytes_Check(rep)) {
5550 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5551 out += moreunits;
5552 } else /* rep is unicode */ {
5553 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5554 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5555 &out, native_ordering);
5556 }
5557
5558 Py_CLEAR(rep);
5559 }
5560
5561 /* Cut back to size actually needed. This is necessary for, for example,
5562 encoding of a string containing isolated surrogates and the 'ignore' handler
5563 is used. */
5564 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5565 if (nsize != PyBytes_GET_SIZE(v))
5566 _PyBytes_Resize(&v, nsize);
5567 Py_XDECREF(errorHandler);
5568 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005569 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005570 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005571 error:
5572 Py_XDECREF(rep);
5573 Py_XDECREF(errorHandler);
5574 Py_XDECREF(exc);
5575 Py_XDECREF(v);
5576 return NULL;
5577#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578}
5579
Alexander Belopolsky40018472011-02-26 01:02:56 +00005580PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005581PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5582 Py_ssize_t size,
5583 const char *errors,
5584 int byteorder)
5585{
5586 PyObject *result;
5587 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5588 if (tmp == NULL)
5589 return NULL;
5590 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5591 Py_DECREF(tmp);
5592 return result;
5593}
5594
5595PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005596PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005598 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599}
5600
5601/* --- Unicode Escape Codec ----------------------------------------------- */
5602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005603/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5604 if all the escapes in the string make it still a valid ASCII string.
5605 Returns -1 if any escapes were found which cause the string to
5606 pop out of ASCII range. Otherwise returns the length of the
5607 required buffer to hold the string.
5608 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005609static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005610length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5611{
5612 const unsigned char *p = (const unsigned char *)s;
5613 const unsigned char *end = p + size;
5614 Py_ssize_t length = 0;
5615
5616 if (size < 0)
5617 return -1;
5618
5619 for (; p < end; ++p) {
5620 if (*p > 127) {
5621 /* Non-ASCII */
5622 return -1;
5623 }
5624 else if (*p != '\\') {
5625 /* Normal character */
5626 ++length;
5627 }
5628 else {
5629 /* Backslash-escape, check next char */
5630 ++p;
5631 /* Escape sequence reaches till end of string or
5632 non-ASCII follow-up. */
5633 if (p >= end || *p > 127)
5634 return -1;
5635 switch (*p) {
5636 case '\n':
5637 /* backslash + \n result in zero characters */
5638 break;
5639 case '\\': case '\'': case '\"':
5640 case 'b': case 'f': case 't':
5641 case 'n': case 'r': case 'v': case 'a':
5642 ++length;
5643 break;
5644 case '0': case '1': case '2': case '3':
5645 case '4': case '5': case '6': case '7':
5646 case 'x': case 'u': case 'U': case 'N':
5647 /* these do not guarantee ASCII characters */
5648 return -1;
5649 default:
5650 /* count the backslash + the other character */
5651 length += 2;
5652 }
5653 }
5654 }
5655 return length;
5656}
5657
Fredrik Lundh06d12682001-01-24 07:59:11 +00005658static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005659
Alexander Belopolsky40018472011-02-26 01:02:56 +00005660PyObject *
5661PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005662 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005663 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005665 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005666 Py_ssize_t startinpos;
5667 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005668 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005670 char* message;
5671 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005672 PyObject *errorHandler = NULL;
5673 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005674 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005675
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005676 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005677 if (len == 0)
5678 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005679
5680 /* After length_of_escaped_ascii_string() there are two alternatives,
5681 either the string is pure ASCII with named escapes like \n, etc.
5682 and we determined it's exact size (common case)
5683 or it contains \x, \u, ... escape sequences. then we create a
5684 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005685 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005686 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005687 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005688 }
5689 else {
5690 /* Escaped strings will always be longer than the resulting
5691 Unicode string, so we start with size here and then reduce the
5692 length after conversion to the true value.
5693 (but if the error callback returns a long replacement string
5694 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005695 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005696 }
5697
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005699 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005701
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 while (s < end) {
5703 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005704 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706
5707 /* Non-escape characters are interpreted as Unicode ordinals */
5708 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005709 x = (unsigned char)*s;
5710 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005711 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005712 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 continue;
5714 }
5715
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005716 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 /* \ - Escapes */
5718 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005719 c = *s++;
5720 if (s > end)
5721 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005722
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005723 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005726#define WRITECHAR(ch) \
5727 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005728 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005729 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005730 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005731
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005733 case '\\': WRITECHAR('\\'); break;
5734 case '\'': WRITECHAR('\''); break;
5735 case '\"': WRITECHAR('\"'); break;
5736 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005737 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005738 case 'f': WRITECHAR('\014'); break;
5739 case 't': WRITECHAR('\t'); break;
5740 case 'n': WRITECHAR('\n'); break;
5741 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005742 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005743 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005744 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005745 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 case '0': case '1': case '2': case '3':
5749 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005750 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005751 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005752 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005753 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005754 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005756 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 break;
5758
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 /* hex escapes */
5760 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005762 digits = 2;
5763 message = "truncated \\xXX escape";
5764 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005768 digits = 4;
5769 message = "truncated \\uXXXX escape";
5770 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005773 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005774 digits = 8;
5775 message = "truncated \\UXXXXXXXX escape";
5776 hexescape:
5777 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005778 if (end - s < digits) {
5779 /* count only hex digits */
5780 for (; s < end; ++s) {
5781 c = (unsigned char)*s;
5782 if (!Py_ISXDIGIT(c))
5783 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005784 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005785 goto error;
5786 }
5787 for (; digits--; ++s) {
5788 c = (unsigned char)*s;
5789 if (!Py_ISXDIGIT(c))
5790 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005791 chr = (chr<<4) & ~0xF;
5792 if (c >= '0' && c <= '9')
5793 chr += c - '0';
5794 else if (c >= 'a' && c <= 'f')
5795 chr += 10 + c - 'a';
5796 else
5797 chr += 10 + c - 'A';
5798 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005799 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800 /* _decoding_error will have already written into the
5801 target buffer. */
5802 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005803 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005804 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005805 message = "illegal Unicode character";
5806 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005807 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005808 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005809 break;
5810
Benjamin Peterson29060642009-01-31 22:14:21 +00005811 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005812 case 'N':
5813 message = "malformed \\N character escape";
5814 if (ucnhash_CAPI == NULL) {
5815 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005816 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5817 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005818 if (ucnhash_CAPI == NULL)
5819 goto ucnhashError;
5820 }
5821 if (*s == '{') {
5822 const char *start = s+1;
5823 /* look for the closing brace */
5824 while (*s != '}' && s < end)
5825 s++;
5826 if (s > start && s < end && *s == '}') {
5827 /* found a name. look it up in the unicode database */
5828 message = "unknown Unicode character name";
5829 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005830 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005831 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005832 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005833 goto store;
5834 }
5835 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005836 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005837
5838 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005839 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005840 message = "\\ at end of string";
5841 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005842 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005843 }
5844 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005845 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005846 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005847 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005848 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005850 continue;
5851
5852 error:
5853 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005854 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005855 errors, &errorHandler,
5856 "unicodeescape", message,
5857 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005858 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005859 goto onError;
5860 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005862#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005863
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005864 Py_XDECREF(errorHandler);
5865 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005866 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005867
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005869 PyErr_SetString(
5870 PyExc_UnicodeError,
5871 "\\N escapes not supported (can't load unicodedata module)"
5872 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005873 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874 Py_XDECREF(errorHandler);
5875 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005876 return NULL;
5877
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005879 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880 Py_XDECREF(errorHandler);
5881 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 return NULL;
5883}
5884
5885/* Return a Unicode-Escape string version of the Unicode object.
5886
5887 If quotes is true, the string is enclosed in u"" or u'' quotes as
5888 appropriate.
5889
5890*/
5891
Alexander Belopolsky40018472011-02-26 01:02:56 +00005892PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005893PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005895 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005896 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005898 int kind;
5899 void *data;
5900 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901
Ezio Melottie7f90372012-10-05 03:33:31 +03005902 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005903 escape.
5904
Ezio Melottie7f90372012-10-05 03:33:31 +03005905 For UCS1 strings it's '\xxx', 4 bytes per source character.
5906 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5907 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005908 */
5909
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005910 if (!PyUnicode_Check(unicode)) {
5911 PyErr_BadArgument();
5912 return NULL;
5913 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005914 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005915 return NULL;
5916 len = PyUnicode_GET_LENGTH(unicode);
5917 kind = PyUnicode_KIND(unicode);
5918 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005919 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005920 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5921 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5922 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5923 }
5924
5925 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005926 return PyBytes_FromStringAndSize(NULL, 0);
5927
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005928 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005930
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005931 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005933 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 if (repr == NULL)
5936 return NULL;
5937
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005938 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005940 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005941 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005942
Walter Dörwald79e913e2007-05-12 11:08:06 +00005943 /* Escape backslashes */
5944 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 *p++ = '\\';
5946 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005947 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005948 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005949
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005950 /* Map 21-bit characters to '\U00xxxxxx' */
5951 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005952 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005953 *p++ = '\\';
5954 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005955 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5956 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5957 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5958 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5959 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5960 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5961 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5962 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005964 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005965
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005967 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 *p++ = '\\';
5969 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005970 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5971 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5972 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5973 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005975
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005976 /* Map special whitespace to '\t', \n', '\r' */
5977 else if (ch == '\t') {
5978 *p++ = '\\';
5979 *p++ = 't';
5980 }
5981 else if (ch == '\n') {
5982 *p++ = '\\';
5983 *p++ = 'n';
5984 }
5985 else if (ch == '\r') {
5986 *p++ = '\\';
5987 *p++ = 'r';
5988 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005989
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005990 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005991 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005993 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005994 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5995 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005996 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005997
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 /* Copy everything else as-is */
5999 else
6000 *p++ = (char) ch;
6001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006003 assert(p - PyBytes_AS_STRING(repr) > 0);
6004 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6005 return NULL;
6006 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007}
6008
Alexander Belopolsky40018472011-02-26 01:02:56 +00006009PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006010PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6011 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006013 PyObject *result;
6014 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6015 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006017 result = PyUnicode_AsUnicodeEscapeString(tmp);
6018 Py_DECREF(tmp);
6019 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020}
6021
6022/* --- Raw Unicode Escape Codec ------------------------------------------- */
6023
Alexander Belopolsky40018472011-02-26 01:02:56 +00006024PyObject *
6025PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006026 Py_ssize_t size,
6027 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006029 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006030 Py_ssize_t startinpos;
6031 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006032 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 const char *end;
6034 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006035 PyObject *errorHandler = NULL;
6036 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006037
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006038 if (size == 0)
6039 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006040
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 /* Escaped strings will always be longer than the resulting
6042 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043 length after conversion to the true value. (But decoding error
6044 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006045 _PyUnicodeWriter_Init(&writer);
6046 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006047
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 end = s + size;
6049 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 unsigned char c;
6051 Py_UCS4 x;
6052 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006053 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054
Benjamin Peterson29060642009-01-31 22:14:21 +00006055 /* Non-escape characters are interpreted as Unicode ordinals */
6056 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006057 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006058 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006059 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006061 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 startinpos = s-starts;
6063
6064 /* \u-escapes are only interpreted iff the number of leading
6065 backslashes if odd */
6066 bs = s;
6067 for (;s < end;) {
6068 if (*s != '\\')
6069 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006070 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006071 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006072 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 }
6074 if (((s - bs) & 1) == 0 ||
6075 s >= end ||
6076 (*s != 'u' && *s != 'U')) {
6077 continue;
6078 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006079 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 count = *s=='u' ? 4 : 8;
6081 s++;
6082
6083 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 for (x = 0, i = 0; i < count; ++i, ++s) {
6085 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006086 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006088 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 errors, &errorHandler,
6090 "rawunicodeescape", "truncated \\uXXXX",
6091 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006092 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 goto onError;
6094 goto nextByte;
6095 }
6096 x = (x<<4) & ~0xF;
6097 if (c >= '0' && c <= '9')
6098 x += c - '0';
6099 else if (c >= 'a' && c <= 'f')
6100 x += 10 + c - 'a';
6101 else
6102 x += 10 + c - 'A';
6103 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006104 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006105 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006106 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006107 }
6108 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006109 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006110 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006111 errors, &errorHandler,
6112 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006114 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006116 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 nextByte:
6118 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120 Py_XDECREF(errorHandler);
6121 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006122 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006123
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006125 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 Py_XDECREF(errorHandler);
6127 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 return NULL;
6129}
6130
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006131
Alexander Belopolsky40018472011-02-26 01:02:56 +00006132PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006133PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006135 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 char *p;
6137 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006138 Py_ssize_t expandsize, pos;
6139 int kind;
6140 void *data;
6141 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006143 if (!PyUnicode_Check(unicode)) {
6144 PyErr_BadArgument();
6145 return NULL;
6146 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006147 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006148 return NULL;
6149 kind = PyUnicode_KIND(unicode);
6150 data = PyUnicode_DATA(unicode);
6151 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006152 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6153 bytes, and 1 byte characters 4. */
6154 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006155
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006158
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 if (repr == NULL)
6161 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006163 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006165 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006166 for (pos = 0; pos < len; pos++) {
6167 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 /* Map 32-bit characters to '\Uxxxxxxxx' */
6169 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006170 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006171 *p++ = '\\';
6172 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006173 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6174 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6175 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6176 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6177 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6178 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6179 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6180 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006181 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 *p++ = '\\';
6185 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006186 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6187 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6188 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6189 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 /* Copy everything else as-is */
6192 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 *p++ = (char) ch;
6194 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006195
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006196 assert(p > q);
6197 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006198 return NULL;
6199 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200}
6201
Alexander Belopolsky40018472011-02-26 01:02:56 +00006202PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006203PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6204 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006206 PyObject *result;
6207 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6208 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006209 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006210 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6211 Py_DECREF(tmp);
6212 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213}
6214
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006215/* --- Unicode Internal Codec ------------------------------------------- */
6216
Alexander Belopolsky40018472011-02-26 01:02:56 +00006217PyObject *
6218_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006219 Py_ssize_t size,
6220 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006221{
6222 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006223 Py_ssize_t startinpos;
6224 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006225 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006226 const char *end;
6227 const char *reason;
6228 PyObject *errorHandler = NULL;
6229 PyObject *exc = NULL;
6230
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006231 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006232 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006233 1))
6234 return NULL;
6235
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006236 if (size == 0)
6237 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006238
Victor Stinner8f674cc2013-04-17 23:02:17 +02006239 _PyUnicodeWriter_Init(&writer);
6240 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6241 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006243 }
6244 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006245
Victor Stinner8f674cc2013-04-17 23:02:17 +02006246 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006247 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006248 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006249 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006250 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006251 endinpos = end-starts;
6252 reason = "truncated input";
6253 goto error;
6254 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006255 /* We copy the raw representation one byte at a time because the
6256 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006257 ((char *) &uch)[0] = s[0];
6258 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006259#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006260 ((char *) &uch)[2] = s[2];
6261 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006262#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006263 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006264#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006265 /* We have to sanity check the raw data, otherwise doom looms for
6266 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006267 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006268 endinpos = s - starts + Py_UNICODE_SIZE;
6269 reason = "illegal code point (> 0x10FFFF)";
6270 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006271 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006272#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006273 s += Py_UNICODE_SIZE;
6274#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006275 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006276 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006277 Py_UNICODE uch2;
6278 ((char *) &uch2)[0] = s[0];
6279 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006280 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006281 {
Victor Stinner551ac952011-11-29 22:58:13 +01006282 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006283 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006284 }
6285 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006286#endif
6287
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006288 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006289 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006290 continue;
6291
6292 error:
6293 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006294 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006295 errors, &errorHandler,
6296 "unicode_internal", reason,
6297 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006298 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006299 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006300 }
6301
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006302 Py_XDECREF(errorHandler);
6303 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006304 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006305
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006307 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006308 Py_XDECREF(errorHandler);
6309 Py_XDECREF(exc);
6310 return NULL;
6311}
6312
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313/* --- Latin-1 Codec ------------------------------------------------------ */
6314
Alexander Belopolsky40018472011-02-26 01:02:56 +00006315PyObject *
6316PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006317 Py_ssize_t size,
6318 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006321 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322}
6323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006324/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006325static void
6326make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006327 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006328 PyObject *unicode,
6329 Py_ssize_t startpos, Py_ssize_t endpos,
6330 const char *reason)
6331{
6332 if (*exceptionObject == NULL) {
6333 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006334 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006335 encoding, unicode, startpos, endpos, reason);
6336 }
6337 else {
6338 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6339 goto onError;
6340 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6341 goto onError;
6342 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6343 goto onError;
6344 return;
6345 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006346 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006347 }
6348}
6349
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006350/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006351static void
6352raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006353 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006354 PyObject *unicode,
6355 Py_ssize_t startpos, Py_ssize_t endpos,
6356 const char *reason)
6357{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006358 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006359 encoding, unicode, startpos, endpos, reason);
6360 if (*exceptionObject != NULL)
6361 PyCodec_StrictErrors(*exceptionObject);
6362}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006363
6364/* error handling callback helper:
6365 build arguments, call the callback and check the arguments,
6366 put the result into newpos and return the replacement string, which
6367 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006368static PyObject *
6369unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006370 PyObject **errorHandler,
6371 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006372 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006373 Py_ssize_t startpos, Py_ssize_t endpos,
6374 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006375{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006376 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006377 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006378 PyObject *restuple;
6379 PyObject *resunicode;
6380
6381 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006383 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006385 }
6386
Benjamin Petersonbac79492012-01-14 13:34:47 -05006387 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006388 return NULL;
6389 len = PyUnicode_GET_LENGTH(unicode);
6390
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006391 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006392 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006393 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006395
6396 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006399 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006401 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 Py_DECREF(restuple);
6403 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006405 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 &resunicode, newpos)) {
6407 Py_DECREF(restuple);
6408 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006409 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006410 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6411 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6412 Py_DECREF(restuple);
6413 return NULL;
6414 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006415 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006416 *newpos = len + *newpos;
6417 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006418 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 Py_DECREF(restuple);
6420 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006421 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 Py_INCREF(resunicode);
6423 Py_DECREF(restuple);
6424 return resunicode;
6425}
6426
Alexander Belopolsky40018472011-02-26 01:02:56 +00006427static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006428unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006429 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006430 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006431{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006432 /* input state */
6433 Py_ssize_t pos=0, size;
6434 int kind;
6435 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 /* output object */
6437 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006438 /* pointer into the output */
6439 char *str;
6440 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006441 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006442 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6443 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444 PyObject *errorHandler = NULL;
6445 PyObject *exc = NULL;
6446 /* the following variable is used for caching string comparisons
6447 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6448 int known_errorHandler = -1;
6449
Benjamin Petersonbac79492012-01-14 13:34:47 -05006450 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006451 return NULL;
6452 size = PyUnicode_GET_LENGTH(unicode);
6453 kind = PyUnicode_KIND(unicode);
6454 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 /* allocate enough for a simple encoding without
6456 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006457 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006458 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006459 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006460 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006461 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006462 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463 ressize = size;
6464
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006465 while (pos < size) {
6466 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006467
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 /* can we encode this? */
6469 if (c<limit) {
6470 /* no overflow check, because we know that the space is enough */
6471 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006472 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006473 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006474 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 Py_ssize_t requiredsize;
6476 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006477 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006478 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006479 Py_ssize_t collstart = pos;
6480 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006482 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 ++collend;
6484 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6485 if (known_errorHandler==-1) {
6486 if ((errors==NULL) || (!strcmp(errors, "strict")))
6487 known_errorHandler = 1;
6488 else if (!strcmp(errors, "replace"))
6489 known_errorHandler = 2;
6490 else if (!strcmp(errors, "ignore"))
6491 known_errorHandler = 3;
6492 else if (!strcmp(errors, "xmlcharrefreplace"))
6493 known_errorHandler = 4;
6494 else
6495 known_errorHandler = 0;
6496 }
6497 switch (known_errorHandler) {
6498 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006499 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 goto onError;
6501 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006502 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 *str++ = '?'; /* fall through */
6504 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006505 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 break;
6507 case 4: /* xmlcharrefreplace */
6508 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006509 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006510 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006511 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006512 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006513 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006514 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006515 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006517 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006519 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006520 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006521 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006522 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006523 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006524 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006525 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006526 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006527 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006528 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006529 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006530 if (requiredsize > PY_SSIZE_T_MAX - incr)
6531 goto overflow;
6532 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006534 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6535 goto overflow;
6536 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006538 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 requiredsize = 2*ressize;
6540 if (_PyBytes_Resize(&res, requiredsize))
6541 goto onError;
6542 str = PyBytes_AS_STRING(res) + respos;
6543 ressize = requiredsize;
6544 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006545 /* generate replacement */
6546 for (i = collstart; i < collend; ++i) {
6547 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006549 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 break;
6551 default:
6552 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006553 encoding, reason, unicode, &exc,
6554 collstart, collend, &newpos);
6555 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006556 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006557 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006558 if (PyBytes_Check(repunicode)) {
6559 /* Directly copy bytes result to output. */
6560 repsize = PyBytes_Size(repunicode);
6561 if (repsize > 1) {
6562 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006563 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006564 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6565 Py_DECREF(repunicode);
6566 goto overflow;
6567 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006568 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6569 Py_DECREF(repunicode);
6570 goto onError;
6571 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006572 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006573 ressize += repsize-1;
6574 }
6575 memcpy(str, PyBytes_AsString(repunicode), repsize);
6576 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006577 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006578 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006579 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006580 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 /* need more space? (at least enough for what we
6582 have+the replacement+the rest of the string, so
6583 we won't have to check space for encodable characters) */
6584 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006585 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006586 requiredsize = respos;
6587 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6588 goto overflow;
6589 requiredsize += repsize;
6590 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6591 goto overflow;
6592 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006594 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006595 requiredsize = 2*ressize;
6596 if (_PyBytes_Resize(&res, requiredsize)) {
6597 Py_DECREF(repunicode);
6598 goto onError;
6599 }
6600 str = PyBytes_AS_STRING(res) + respos;
6601 ressize = requiredsize;
6602 }
6603 /* check if there is anything unencodable in the replacement
6604 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006605 for (i = 0; repsize-->0; ++i, ++str) {
6606 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006608 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006609 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 Py_DECREF(repunicode);
6611 goto onError;
6612 }
6613 *str = (char)c;
6614 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006615 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006616 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006617 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006618 }
6619 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006620 /* Resize if we allocated to much */
6621 size = str - PyBytes_AS_STRING(res);
6622 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006623 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006624 if (_PyBytes_Resize(&res, size) < 0)
6625 goto onError;
6626 }
6627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006628 Py_XDECREF(errorHandler);
6629 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006630 return res;
6631
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006632 overflow:
6633 PyErr_SetString(PyExc_OverflowError,
6634 "encoded result is too long for a Python string");
6635
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006636 onError:
6637 Py_XDECREF(res);
6638 Py_XDECREF(errorHandler);
6639 Py_XDECREF(exc);
6640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641}
6642
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006643/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006644PyObject *
6645PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006646 Py_ssize_t size,
6647 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 PyObject *result;
6650 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6651 if (unicode == NULL)
6652 return NULL;
6653 result = unicode_encode_ucs1(unicode, errors, 256);
6654 Py_DECREF(unicode);
6655 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656}
6657
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006659_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660{
6661 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 PyErr_BadArgument();
6663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006665 if (PyUnicode_READY(unicode) == -1)
6666 return NULL;
6667 /* Fast path: if it is a one-byte string, construct
6668 bytes object directly. */
6669 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6670 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6671 PyUnicode_GET_LENGTH(unicode));
6672 /* Non-Latin-1 characters present. Defer to above function to
6673 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006675}
6676
6677PyObject*
6678PyUnicode_AsLatin1String(PyObject *unicode)
6679{
6680 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681}
6682
6683/* --- 7-bit ASCII Codec -------------------------------------------------- */
6684
Alexander Belopolsky40018472011-02-26 01:02:56 +00006685PyObject *
6686PyUnicode_DecodeASCII(const char *s,
6687 Py_ssize_t size,
6688 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006691 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006692 int kind;
6693 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006694 Py_ssize_t startinpos;
6695 Py_ssize_t endinpos;
6696 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697 const char *e;
6698 PyObject *errorHandler = NULL;
6699 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006700
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006702 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006705 if (size == 1 && (unsigned char)s[0] < 128)
6706 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006707
Victor Stinner8f674cc2013-04-17 23:02:17 +02006708 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006709 writer.min_length = size;
6710 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006711 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006714 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006715 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006716 writer.pos = outpos;
6717 if (writer.pos == size)
6718 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006719
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006720 s += writer.pos;
6721 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006723 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006725 PyUnicode_WRITE(kind, data, writer.pos, c);
6726 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 ++s;
6728 }
6729 else {
6730 startinpos = s-starts;
6731 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006732 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 errors, &errorHandler,
6734 "ascii", "ordinal not in range(128)",
6735 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006736 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006738 kind = writer.kind;
6739 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 Py_XDECREF(errorHandler);
6743 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006744 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006745
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006747 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748 Py_XDECREF(errorHandler);
6749 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 return NULL;
6751}
6752
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006754PyObject *
6755PyUnicode_EncodeASCII(const Py_UNICODE *p,
6756 Py_ssize_t size,
6757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 PyObject *result;
6760 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6761 if (unicode == NULL)
6762 return NULL;
6763 result = unicode_encode_ucs1(unicode, errors, 128);
6764 Py_DECREF(unicode);
6765 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766}
6767
Alexander Belopolsky40018472011-02-26 01:02:56 +00006768PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006769_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
6771 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 PyErr_BadArgument();
6773 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006775 if (PyUnicode_READY(unicode) == -1)
6776 return NULL;
6777 /* Fast path: if it is an ASCII-only string, construct bytes object
6778 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006779 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006780 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6781 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006782 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006783}
6784
6785PyObject *
6786PyUnicode_AsASCIIString(PyObject *unicode)
6787{
6788 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789}
6790
Victor Stinner99b95382011-07-04 14:23:54 +02006791#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006792
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006793/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006794
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006795#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006796#define NEED_RETRY
6797#endif
6798
Victor Stinner3a50e702011-10-18 21:21:00 +02006799#ifndef WC_ERR_INVALID_CHARS
6800# define WC_ERR_INVALID_CHARS 0x0080
6801#endif
6802
6803static char*
6804code_page_name(UINT code_page, PyObject **obj)
6805{
6806 *obj = NULL;
6807 if (code_page == CP_ACP)
6808 return "mbcs";
6809 if (code_page == CP_UTF7)
6810 return "CP_UTF7";
6811 if (code_page == CP_UTF8)
6812 return "CP_UTF8";
6813
6814 *obj = PyBytes_FromFormat("cp%u", code_page);
6815 if (*obj == NULL)
6816 return NULL;
6817 return PyBytes_AS_STRING(*obj);
6818}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819
Victor Stinner3a50e702011-10-18 21:21:00 +02006820static DWORD
6821decode_code_page_flags(UINT code_page)
6822{
6823 if (code_page == CP_UTF7) {
6824 /* The CP_UTF7 decoder only supports flags=0 */
6825 return 0;
6826 }
6827 else
6828 return MB_ERR_INVALID_CHARS;
6829}
6830
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006831/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006832 * Decode a byte string from a Windows code page into unicode object in strict
6833 * mode.
6834 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006835 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6836 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006837 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006838static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006839decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006840 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006841 const char *in,
6842 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843{
Victor Stinner3a50e702011-10-18 21:21:00 +02006844 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006845 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006846 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006847
6848 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006849 assert(insize > 0);
6850 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6851 if (outsize <= 0)
6852 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853
6854 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006856 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006857 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 if (*v == NULL)
6859 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006860 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006861 }
6862 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006864 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006865 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006867 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868 }
6869
6870 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006871 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6872 if (outsize <= 0)
6873 goto error;
6874 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006875
Victor Stinner3a50e702011-10-18 21:21:00 +02006876error:
6877 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6878 return -2;
6879 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006880 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006881}
6882
Victor Stinner3a50e702011-10-18 21:21:00 +02006883/*
6884 * Decode a byte string from a code page into unicode object with an error
6885 * handler.
6886 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006887 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006888 * UnicodeDecodeError exception and returns -1 on error.
6889 */
6890static int
6891decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006892 PyObject **v,
6893 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006894 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006895{
6896 const char *startin = in;
6897 const char *endin = in + size;
6898 const DWORD flags = decode_code_page_flags(code_page);
6899 /* Ideally, we should get reason from FormatMessage. This is the Windows
6900 2000 English version of the message. */
6901 const char *reason = "No mapping for the Unicode character exists "
6902 "in the target code page.";
6903 /* each step cannot decode more than 1 character, but a character can be
6904 represented as a surrogate pair */
6905 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006906 int insize;
6907 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 PyObject *errorHandler = NULL;
6909 PyObject *exc = NULL;
6910 PyObject *encoding_obj = NULL;
6911 char *encoding;
6912 DWORD err;
6913 int ret = -1;
6914
6915 assert(size > 0);
6916
6917 encoding = code_page_name(code_page, &encoding_obj);
6918 if (encoding == NULL)
6919 return -1;
6920
Victor Stinner7d00cc12014-03-17 23:08:06 +01006921 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006922 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6923 UnicodeDecodeError. */
6924 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6925 if (exc != NULL) {
6926 PyCodec_StrictErrors(exc);
6927 Py_CLEAR(exc);
6928 }
6929 goto error;
6930 }
6931
6932 if (*v == NULL) {
6933 /* Create unicode object */
6934 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6935 PyErr_NoMemory();
6936 goto error;
6937 }
Victor Stinnerab595942011-12-17 04:59:06 +01006938 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006939 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006940 if (*v == NULL)
6941 goto error;
6942 startout = PyUnicode_AS_UNICODE(*v);
6943 }
6944 else {
6945 /* Extend unicode object */
6946 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6947 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6948 PyErr_NoMemory();
6949 goto error;
6950 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006951 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006952 goto error;
6953 startout = PyUnicode_AS_UNICODE(*v) + n;
6954 }
6955
6956 /* Decode the byte string character per character */
6957 out = startout;
6958 while (in < endin)
6959 {
6960 /* Decode a character */
6961 insize = 1;
6962 do
6963 {
6964 outsize = MultiByteToWideChar(code_page, flags,
6965 in, insize,
6966 buffer, Py_ARRAY_LENGTH(buffer));
6967 if (outsize > 0)
6968 break;
6969 err = GetLastError();
6970 if (err != ERROR_NO_UNICODE_TRANSLATION
6971 && err != ERROR_INSUFFICIENT_BUFFER)
6972 {
6973 PyErr_SetFromWindowsErr(0);
6974 goto error;
6975 }
6976 insize++;
6977 }
6978 /* 4=maximum length of a UTF-8 sequence */
6979 while (insize <= 4 && (in + insize) <= endin);
6980
6981 if (outsize <= 0) {
6982 Py_ssize_t startinpos, endinpos, outpos;
6983
Victor Stinner7d00cc12014-03-17 23:08:06 +01006984 /* last character in partial decode? */
6985 if (in + insize >= endin && !final)
6986 break;
6987
Victor Stinner3a50e702011-10-18 21:21:00 +02006988 startinpos = in - startin;
6989 endinpos = startinpos + 1;
6990 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006991 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006992 errors, &errorHandler,
6993 encoding, reason,
6994 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006995 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006996 {
6997 goto error;
6998 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006999 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007000 }
7001 else {
7002 in += insize;
7003 memcpy(out, buffer, outsize * sizeof(wchar_t));
7004 out += outsize;
7005 }
7006 }
7007
7008 /* write a NUL character at the end */
7009 *out = 0;
7010
7011 /* Extend unicode object */
7012 outsize = out - startout;
7013 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007014 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007015 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007016 /* (in - startin) <= size and size is an int */
7017 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007018
7019error:
7020 Py_XDECREF(encoding_obj);
7021 Py_XDECREF(errorHandler);
7022 Py_XDECREF(exc);
7023 return ret;
7024}
7025
Victor Stinner3a50e702011-10-18 21:21:00 +02007026static PyObject *
7027decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007028 const char *s, Py_ssize_t size,
7029 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007030{
Victor Stinner76a31a62011-11-04 00:05:13 +01007031 PyObject *v = NULL;
7032 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007033
Victor Stinner3a50e702011-10-18 21:21:00 +02007034 if (code_page < 0) {
7035 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7036 return NULL;
7037 }
7038
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007039 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007040 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007041
Victor Stinner76a31a62011-11-04 00:05:13 +01007042 do
7043 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007044#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007045 if (size > INT_MAX) {
7046 chunk_size = INT_MAX;
7047 final = 0;
7048 done = 0;
7049 }
7050 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007051#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007052 {
7053 chunk_size = (int)size;
7054 final = (consumed == NULL);
7055 done = 1;
7056 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007057
Victor Stinner76a31a62011-11-04 00:05:13 +01007058 if (chunk_size == 0 && done) {
7059 if (v != NULL)
7060 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007061 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007062 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007063
Victor Stinner76a31a62011-11-04 00:05:13 +01007064 converted = decode_code_page_strict(code_page, &v,
7065 s, chunk_size);
7066 if (converted == -2)
7067 converted = decode_code_page_errors(code_page, &v,
7068 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007069 errors, final);
7070 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007071
7072 if (converted < 0) {
7073 Py_XDECREF(v);
7074 return NULL;
7075 }
7076
7077 if (consumed)
7078 *consumed += converted;
7079
7080 s += converted;
7081 size -= converted;
7082 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007083
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007084 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007085}
7086
Alexander Belopolsky40018472011-02-26 01:02:56 +00007087PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007088PyUnicode_DecodeCodePageStateful(int code_page,
7089 const char *s,
7090 Py_ssize_t size,
7091 const char *errors,
7092 Py_ssize_t *consumed)
7093{
7094 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7095}
7096
7097PyObject *
7098PyUnicode_DecodeMBCSStateful(const char *s,
7099 Py_ssize_t size,
7100 const char *errors,
7101 Py_ssize_t *consumed)
7102{
7103 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7104}
7105
7106PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007107PyUnicode_DecodeMBCS(const char *s,
7108 Py_ssize_t size,
7109 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007110{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7112}
7113
Victor Stinner3a50e702011-10-18 21:21:00 +02007114static DWORD
7115encode_code_page_flags(UINT code_page, const char *errors)
7116{
7117 if (code_page == CP_UTF8) {
7118 if (winver.dwMajorVersion >= 6)
7119 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7120 and later */
7121 return WC_ERR_INVALID_CHARS;
7122 else
7123 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7124 return 0;
7125 }
7126 else if (code_page == CP_UTF7) {
7127 /* CP_UTF7 only supports flags=0 */
7128 return 0;
7129 }
7130 else {
7131 if (errors != NULL && strcmp(errors, "replace") == 0)
7132 return 0;
7133 else
7134 return WC_NO_BEST_FIT_CHARS;
7135 }
7136}
7137
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007138/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 * Encode a Unicode string to a Windows code page into a byte string in strict
7140 * mode.
7141 *
7142 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007143 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007144 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007145static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007146encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007147 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007148 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007149{
Victor Stinner554f3f02010-06-16 23:33:54 +00007150 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007151 BOOL *pusedDefaultChar = &usedDefaultChar;
7152 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007153 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007154 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007155 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007156 const DWORD flags = encode_code_page_flags(code_page, NULL);
7157 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007158 /* Create a substring so that we can get the UTF-16 representation
7159 of just the slice under consideration. */
7160 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007161
Martin v. Löwis3d325192011-11-04 18:23:06 +01007162 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007163
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007165 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007167 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007168
Victor Stinner2fc507f2011-11-04 20:06:39 +01007169 substring = PyUnicode_Substring(unicode, offset, offset+len);
7170 if (substring == NULL)
7171 return -1;
7172 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7173 if (p == NULL) {
7174 Py_DECREF(substring);
7175 return -1;
7176 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007177 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007178
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007179 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007181 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 NULL, 0,
7183 NULL, pusedDefaultChar);
7184 if (outsize <= 0)
7185 goto error;
7186 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007187 if (pusedDefaultChar && *pusedDefaultChar) {
7188 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007189 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007190 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007191
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007193 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007195 if (*outbytes == NULL) {
7196 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007198 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007199 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007200 }
7201 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007203 const Py_ssize_t n = PyBytes_Size(*outbytes);
7204 if (outsize > PY_SSIZE_T_MAX - n) {
7205 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007206 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007207 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007208 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007209 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7210 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007212 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007214 }
7215
7216 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007217 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007218 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007219 out, outsize,
7220 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007221 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 if (outsize <= 0)
7223 goto error;
7224 if (pusedDefaultChar && *pusedDefaultChar)
7225 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007226 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007227
Victor Stinner3a50e702011-10-18 21:21:00 +02007228error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007229 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7231 return -2;
7232 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007233 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007234}
7235
Victor Stinner3a50e702011-10-18 21:21:00 +02007236/*
7237 * Encode a Unicode string to a Windows code page into a byte string using a
7238 * error handler.
7239 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007240 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007241 * -1 on other error.
7242 */
7243static int
7244encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007245 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007246 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007247{
Victor Stinner3a50e702011-10-18 21:21:00 +02007248 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007249 Py_ssize_t pos = unicode_offset;
7250 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 /* Ideally, we should get reason from FormatMessage. This is the Windows
7252 2000 English version of the message. */
7253 const char *reason = "invalid character";
7254 /* 4=maximum length of a UTF-8 sequence */
7255 char buffer[4];
7256 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7257 Py_ssize_t outsize;
7258 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007259 PyObject *errorHandler = NULL;
7260 PyObject *exc = NULL;
7261 PyObject *encoding_obj = NULL;
7262 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007263 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007264 PyObject *rep;
7265 int ret = -1;
7266
7267 assert(insize > 0);
7268
7269 encoding = code_page_name(code_page, &encoding_obj);
7270 if (encoding == NULL)
7271 return -1;
7272
7273 if (errors == NULL || strcmp(errors, "strict") == 0) {
7274 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7275 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007276 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007277 if (exc != NULL) {
7278 PyCodec_StrictErrors(exc);
7279 Py_DECREF(exc);
7280 }
7281 Py_XDECREF(encoding_obj);
7282 return -1;
7283 }
7284
7285 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7286 pusedDefaultChar = &usedDefaultChar;
7287 else
7288 pusedDefaultChar = NULL;
7289
7290 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7291 PyErr_NoMemory();
7292 goto error;
7293 }
7294 outsize = insize * Py_ARRAY_LENGTH(buffer);
7295
7296 if (*outbytes == NULL) {
7297 /* Create string object */
7298 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7299 if (*outbytes == NULL)
7300 goto error;
7301 out = PyBytes_AS_STRING(*outbytes);
7302 }
7303 else {
7304 /* Extend string object */
7305 Py_ssize_t n = PyBytes_Size(*outbytes);
7306 if (n > PY_SSIZE_T_MAX - outsize) {
7307 PyErr_NoMemory();
7308 goto error;
7309 }
7310 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7311 goto error;
7312 out = PyBytes_AS_STRING(*outbytes) + n;
7313 }
7314
7315 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007316 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007317 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007318 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7319 wchar_t chars[2];
7320 int charsize;
7321 if (ch < 0x10000) {
7322 chars[0] = (wchar_t)ch;
7323 charsize = 1;
7324 }
7325 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007326 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7327 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007328 charsize = 2;
7329 }
7330
Victor Stinner3a50e702011-10-18 21:21:00 +02007331 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007332 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007333 buffer, Py_ARRAY_LENGTH(buffer),
7334 NULL, pusedDefaultChar);
7335 if (outsize > 0) {
7336 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7337 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007338 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007339 memcpy(out, buffer, outsize);
7340 out += outsize;
7341 continue;
7342 }
7343 }
7344 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7345 PyErr_SetFromWindowsErr(0);
7346 goto error;
7347 }
7348
Victor Stinner3a50e702011-10-18 21:21:00 +02007349 rep = unicode_encode_call_errorhandler(
7350 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007351 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007352 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007353 if (rep == NULL)
7354 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007355 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007356
7357 if (PyBytes_Check(rep)) {
7358 outsize = PyBytes_GET_SIZE(rep);
7359 if (outsize != 1) {
7360 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7361 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7362 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7363 Py_DECREF(rep);
7364 goto error;
7365 }
7366 out = PyBytes_AS_STRING(*outbytes) + offset;
7367 }
7368 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7369 out += outsize;
7370 }
7371 else {
7372 Py_ssize_t i;
7373 enum PyUnicode_Kind kind;
7374 void *data;
7375
Benjamin Petersonbac79492012-01-14 13:34:47 -05007376 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007377 Py_DECREF(rep);
7378 goto error;
7379 }
7380
7381 outsize = PyUnicode_GET_LENGTH(rep);
7382 if (outsize != 1) {
7383 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7384 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7385 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7386 Py_DECREF(rep);
7387 goto error;
7388 }
7389 out = PyBytes_AS_STRING(*outbytes) + offset;
7390 }
7391 kind = PyUnicode_KIND(rep);
7392 data = PyUnicode_DATA(rep);
7393 for (i=0; i < outsize; i++) {
7394 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7395 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007396 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007397 encoding, unicode,
7398 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 "unable to encode error handler result to ASCII");
7400 Py_DECREF(rep);
7401 goto error;
7402 }
7403 *out = (unsigned char)ch;
7404 out++;
7405 }
7406 }
7407 Py_DECREF(rep);
7408 }
7409 /* write a NUL byte */
7410 *out = 0;
7411 outsize = out - PyBytes_AS_STRING(*outbytes);
7412 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7413 if (_PyBytes_Resize(outbytes, outsize) < 0)
7414 goto error;
7415 ret = 0;
7416
7417error:
7418 Py_XDECREF(encoding_obj);
7419 Py_XDECREF(errorHandler);
7420 Py_XDECREF(exc);
7421 return ret;
7422}
7423
Victor Stinner3a50e702011-10-18 21:21:00 +02007424static PyObject *
7425encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007426 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 const char *errors)
7428{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007429 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007430 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007431 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007432 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007433
Victor Stinner29dacf22015-01-26 16:41:32 +01007434 if (!PyUnicode_Check(unicode)) {
7435 PyErr_BadArgument();
7436 return NULL;
7437 }
7438
Benjamin Petersonbac79492012-01-14 13:34:47 -05007439 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007440 return NULL;
7441 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007442
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 if (code_page < 0) {
7444 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7445 return NULL;
7446 }
7447
Martin v. Löwis3d325192011-11-04 18:23:06 +01007448 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007449 return PyBytes_FromStringAndSize(NULL, 0);
7450
Victor Stinner7581cef2011-11-03 22:32:33 +01007451 offset = 0;
7452 do
7453 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007454#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007455 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007456 chunks. */
7457 if (len > INT_MAX/2) {
7458 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007459 done = 0;
7460 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007461 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007462#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007463 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007464 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007465 done = 1;
7466 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007467
Victor Stinner76a31a62011-11-04 00:05:13 +01007468 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007469 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007470 errors);
7471 if (ret == -2)
7472 ret = encode_code_page_errors(code_page, &outbytes,
7473 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007474 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007475 if (ret < 0) {
7476 Py_XDECREF(outbytes);
7477 return NULL;
7478 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007479
Victor Stinner7581cef2011-11-03 22:32:33 +01007480 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007481 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007482 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007483
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 return outbytes;
7485}
7486
7487PyObject *
7488PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7489 Py_ssize_t size,
7490 const char *errors)
7491{
Victor Stinner7581cef2011-11-03 22:32:33 +01007492 PyObject *unicode, *res;
7493 unicode = PyUnicode_FromUnicode(p, size);
7494 if (unicode == NULL)
7495 return NULL;
7496 res = encode_code_page(CP_ACP, unicode, errors);
7497 Py_DECREF(unicode);
7498 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007499}
7500
7501PyObject *
7502PyUnicode_EncodeCodePage(int code_page,
7503 PyObject *unicode,
7504 const char *errors)
7505{
Victor Stinner7581cef2011-11-03 22:32:33 +01007506 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007507}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007508
Alexander Belopolsky40018472011-02-26 01:02:56 +00007509PyObject *
7510PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007511{
Victor Stinner7581cef2011-11-03 22:32:33 +01007512 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007513}
7514
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007515#undef NEED_RETRY
7516
Victor Stinner99b95382011-07-04 14:23:54 +02007517#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007518
Guido van Rossumd57fd912000-03-10 22:53:23 +00007519/* --- Character Mapping Codec -------------------------------------------- */
7520
Victor Stinnerfb161b12013-04-18 01:44:27 +02007521static int
7522charmap_decode_string(const char *s,
7523 Py_ssize_t size,
7524 PyObject *mapping,
7525 const char *errors,
7526 _PyUnicodeWriter *writer)
7527{
7528 const char *starts = s;
7529 const char *e;
7530 Py_ssize_t startinpos, endinpos;
7531 PyObject *errorHandler = NULL, *exc = NULL;
7532 Py_ssize_t maplen;
7533 enum PyUnicode_Kind mapkind;
7534 void *mapdata;
7535 Py_UCS4 x;
7536 unsigned char ch;
7537
7538 if (PyUnicode_READY(mapping) == -1)
7539 return -1;
7540
7541 maplen = PyUnicode_GET_LENGTH(mapping);
7542 mapdata = PyUnicode_DATA(mapping);
7543 mapkind = PyUnicode_KIND(mapping);
7544
7545 e = s + size;
7546
7547 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7548 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7549 * is disabled in encoding aliases, latin1 is preferred because
7550 * its implementation is faster. */
7551 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7552 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7553 Py_UCS4 maxchar = writer->maxchar;
7554
7555 assert (writer->kind == PyUnicode_1BYTE_KIND);
7556 while (s < e) {
7557 ch = *s;
7558 x = mapdata_ucs1[ch];
7559 if (x > maxchar) {
7560 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7561 goto onError;
7562 maxchar = writer->maxchar;
7563 outdata = (Py_UCS1 *)writer->data;
7564 }
7565 outdata[writer->pos] = x;
7566 writer->pos++;
7567 ++s;
7568 }
7569 return 0;
7570 }
7571
7572 while (s < e) {
7573 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7574 enum PyUnicode_Kind outkind = writer->kind;
7575 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7576 if (outkind == PyUnicode_1BYTE_KIND) {
7577 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7578 Py_UCS4 maxchar = writer->maxchar;
7579 while (s < e) {
7580 ch = *s;
7581 x = mapdata_ucs2[ch];
7582 if (x > maxchar)
7583 goto Error;
7584 outdata[writer->pos] = x;
7585 writer->pos++;
7586 ++s;
7587 }
7588 break;
7589 }
7590 else if (outkind == PyUnicode_2BYTE_KIND) {
7591 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7592 while (s < e) {
7593 ch = *s;
7594 x = mapdata_ucs2[ch];
7595 if (x == 0xFFFE)
7596 goto Error;
7597 outdata[writer->pos] = x;
7598 writer->pos++;
7599 ++s;
7600 }
7601 break;
7602 }
7603 }
7604 ch = *s;
7605
7606 if (ch < maplen)
7607 x = PyUnicode_READ(mapkind, mapdata, ch);
7608 else
7609 x = 0xfffe; /* invalid value */
7610Error:
7611 if (x == 0xfffe)
7612 {
7613 /* undefined mapping */
7614 startinpos = s-starts;
7615 endinpos = startinpos+1;
7616 if (unicode_decode_call_errorhandler_writer(
7617 errors, &errorHandler,
7618 "charmap", "character maps to <undefined>",
7619 &starts, &e, &startinpos, &endinpos, &exc, &s,
7620 writer)) {
7621 goto onError;
7622 }
7623 continue;
7624 }
7625
7626 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7627 goto onError;
7628 ++s;
7629 }
7630 Py_XDECREF(errorHandler);
7631 Py_XDECREF(exc);
7632 return 0;
7633
7634onError:
7635 Py_XDECREF(errorHandler);
7636 Py_XDECREF(exc);
7637 return -1;
7638}
7639
7640static int
7641charmap_decode_mapping(const char *s,
7642 Py_ssize_t size,
7643 PyObject *mapping,
7644 const char *errors,
7645 _PyUnicodeWriter *writer)
7646{
7647 const char *starts = s;
7648 const char *e;
7649 Py_ssize_t startinpos, endinpos;
7650 PyObject *errorHandler = NULL, *exc = NULL;
7651 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007652 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007653
7654 e = s + size;
7655
7656 while (s < e) {
7657 ch = *s;
7658
7659 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7660 key = PyLong_FromLong((long)ch);
7661 if (key == NULL)
7662 goto onError;
7663
7664 item = PyObject_GetItem(mapping, key);
7665 Py_DECREF(key);
7666 if (item == NULL) {
7667 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7668 /* No mapping found means: mapping is undefined. */
7669 PyErr_Clear();
7670 goto Undefined;
7671 } else
7672 goto onError;
7673 }
7674
7675 /* Apply mapping */
7676 if (item == Py_None)
7677 goto Undefined;
7678 if (PyLong_Check(item)) {
7679 long value = PyLong_AS_LONG(item);
7680 if (value == 0xFFFE)
7681 goto Undefined;
7682 if (value < 0 || value > MAX_UNICODE) {
7683 PyErr_Format(PyExc_TypeError,
7684 "character mapping must be in range(0x%lx)",
7685 (unsigned long)MAX_UNICODE + 1);
7686 goto onError;
7687 }
7688
7689 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7690 goto onError;
7691 }
7692 else if (PyUnicode_Check(item)) {
7693 if (PyUnicode_READY(item) == -1)
7694 goto onError;
7695 if (PyUnicode_GET_LENGTH(item) == 1) {
7696 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7697 if (value == 0xFFFE)
7698 goto Undefined;
7699 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7700 goto onError;
7701 }
7702 else {
7703 writer->overallocate = 1;
7704 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7705 goto onError;
7706 }
7707 }
7708 else {
7709 /* wrong return value */
7710 PyErr_SetString(PyExc_TypeError,
7711 "character mapping must return integer, None or str");
7712 goto onError;
7713 }
7714 Py_CLEAR(item);
7715 ++s;
7716 continue;
7717
7718Undefined:
7719 /* undefined mapping */
7720 Py_CLEAR(item);
7721 startinpos = s-starts;
7722 endinpos = startinpos+1;
7723 if (unicode_decode_call_errorhandler_writer(
7724 errors, &errorHandler,
7725 "charmap", "character maps to <undefined>",
7726 &starts, &e, &startinpos, &endinpos, &exc, &s,
7727 writer)) {
7728 goto onError;
7729 }
7730 }
7731 Py_XDECREF(errorHandler);
7732 Py_XDECREF(exc);
7733 return 0;
7734
7735onError:
7736 Py_XDECREF(item);
7737 Py_XDECREF(errorHandler);
7738 Py_XDECREF(exc);
7739 return -1;
7740}
7741
Alexander Belopolsky40018472011-02-26 01:02:56 +00007742PyObject *
7743PyUnicode_DecodeCharmap(const char *s,
7744 Py_ssize_t size,
7745 PyObject *mapping,
7746 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007747{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007748 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007749
Guido van Rossumd57fd912000-03-10 22:53:23 +00007750 /* Default to Latin-1 */
7751 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753
Guido van Rossumd57fd912000-03-10 22:53:23 +00007754 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007755 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007756 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007757 writer.min_length = size;
7758 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007759 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007760
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007761 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007762 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7763 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007764 }
7765 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007766 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7767 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007768 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007769 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007770
Benjamin Peterson29060642009-01-31 22:14:21 +00007771 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007772 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007773 return NULL;
7774}
7775
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007776/* Charmap encoding: the lookup table */
7777
Alexander Belopolsky40018472011-02-26 01:02:56 +00007778struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007779 PyObject_HEAD
7780 unsigned char level1[32];
7781 int count2, count3;
7782 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007783};
7784
7785static PyObject*
7786encoding_map_size(PyObject *obj, PyObject* args)
7787{
7788 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007789 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007790 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007791}
7792
7793static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007794 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007795 PyDoc_STR("Return the size (in bytes) of this object") },
7796 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007797};
7798
7799static void
7800encoding_map_dealloc(PyObject* o)
7801{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007802 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007803}
7804
7805static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007806 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007807 "EncodingMap", /*tp_name*/
7808 sizeof(struct encoding_map), /*tp_basicsize*/
7809 0, /*tp_itemsize*/
7810 /* methods */
7811 encoding_map_dealloc, /*tp_dealloc*/
7812 0, /*tp_print*/
7813 0, /*tp_getattr*/
7814 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007815 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007816 0, /*tp_repr*/
7817 0, /*tp_as_number*/
7818 0, /*tp_as_sequence*/
7819 0, /*tp_as_mapping*/
7820 0, /*tp_hash*/
7821 0, /*tp_call*/
7822 0, /*tp_str*/
7823 0, /*tp_getattro*/
7824 0, /*tp_setattro*/
7825 0, /*tp_as_buffer*/
7826 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7827 0, /*tp_doc*/
7828 0, /*tp_traverse*/
7829 0, /*tp_clear*/
7830 0, /*tp_richcompare*/
7831 0, /*tp_weaklistoffset*/
7832 0, /*tp_iter*/
7833 0, /*tp_iternext*/
7834 encoding_map_methods, /*tp_methods*/
7835 0, /*tp_members*/
7836 0, /*tp_getset*/
7837 0, /*tp_base*/
7838 0, /*tp_dict*/
7839 0, /*tp_descr_get*/
7840 0, /*tp_descr_set*/
7841 0, /*tp_dictoffset*/
7842 0, /*tp_init*/
7843 0, /*tp_alloc*/
7844 0, /*tp_new*/
7845 0, /*tp_free*/
7846 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007847};
7848
7849PyObject*
7850PyUnicode_BuildEncodingMap(PyObject* string)
7851{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007852 PyObject *result;
7853 struct encoding_map *mresult;
7854 int i;
7855 int need_dict = 0;
7856 unsigned char level1[32];
7857 unsigned char level2[512];
7858 unsigned char *mlevel1, *mlevel2, *mlevel3;
7859 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007860 int kind;
7861 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007862 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007863 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007864
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007865 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007866 PyErr_BadArgument();
7867 return NULL;
7868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007869 kind = PyUnicode_KIND(string);
7870 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007871 length = PyUnicode_GET_LENGTH(string);
7872 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007873 memset(level1, 0xFF, sizeof level1);
7874 memset(level2, 0xFF, sizeof level2);
7875
7876 /* If there isn't a one-to-one mapping of NULL to \0,
7877 or if there are non-BMP characters, we need to use
7878 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007879 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007880 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007881 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007882 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007883 ch = PyUnicode_READ(kind, data, i);
7884 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007885 need_dict = 1;
7886 break;
7887 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007888 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007889 /* unmapped character */
7890 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007891 l1 = ch >> 11;
7892 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007893 if (level1[l1] == 0xFF)
7894 level1[l1] = count2++;
7895 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007896 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007897 }
7898
7899 if (count2 >= 0xFF || count3 >= 0xFF)
7900 need_dict = 1;
7901
7902 if (need_dict) {
7903 PyObject *result = PyDict_New();
7904 PyObject *key, *value;
7905 if (!result)
7906 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007907 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007908 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007909 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007910 if (!key || !value)
7911 goto failed1;
7912 if (PyDict_SetItem(result, key, value) == -1)
7913 goto failed1;
7914 Py_DECREF(key);
7915 Py_DECREF(value);
7916 }
7917 return result;
7918 failed1:
7919 Py_XDECREF(key);
7920 Py_XDECREF(value);
7921 Py_DECREF(result);
7922 return NULL;
7923 }
7924
7925 /* Create a three-level trie */
7926 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7927 16*count2 + 128*count3 - 1);
7928 if (!result)
7929 return PyErr_NoMemory();
7930 PyObject_Init(result, &EncodingMapType);
7931 mresult = (struct encoding_map*)result;
7932 mresult->count2 = count2;
7933 mresult->count3 = count3;
7934 mlevel1 = mresult->level1;
7935 mlevel2 = mresult->level23;
7936 mlevel3 = mresult->level23 + 16*count2;
7937 memcpy(mlevel1, level1, 32);
7938 memset(mlevel2, 0xFF, 16*count2);
7939 memset(mlevel3, 0, 128*count3);
7940 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007941 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007942 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007943 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7944 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007945 /* unmapped character */
7946 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007947 o1 = ch>>11;
7948 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007949 i2 = 16*mlevel1[o1] + o2;
7950 if (mlevel2[i2] == 0xFF)
7951 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007952 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007953 i3 = 128*mlevel2[i2] + o3;
7954 mlevel3[i3] = i;
7955 }
7956 return result;
7957}
7958
7959static int
Victor Stinner22168992011-11-20 17:09:18 +01007960encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007961{
7962 struct encoding_map *map = (struct encoding_map*)mapping;
7963 int l1 = c>>11;
7964 int l2 = (c>>7) & 0xF;
7965 int l3 = c & 0x7F;
7966 int i;
7967
Victor Stinner22168992011-11-20 17:09:18 +01007968 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007969 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007970 if (c == 0)
7971 return 0;
7972 /* level 1*/
7973 i = map->level1[l1];
7974 if (i == 0xFF) {
7975 return -1;
7976 }
7977 /* level 2*/
7978 i = map->level23[16*i+l2];
7979 if (i == 0xFF) {
7980 return -1;
7981 }
7982 /* level 3 */
7983 i = map->level23[16*map->count2 + 128*i + l3];
7984 if (i == 0) {
7985 return -1;
7986 }
7987 return i;
7988}
7989
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007990/* Lookup the character ch in the mapping. If the character
7991 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007992 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007993static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007994charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995{
Christian Heimes217cfd12007-12-02 14:31:20 +00007996 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007997 PyObject *x;
7998
7999 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008000 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008001 x = PyObject_GetItem(mapping, w);
8002 Py_DECREF(w);
8003 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008004 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8005 /* No mapping found means: mapping is undefined. */
8006 PyErr_Clear();
8007 x = Py_None;
8008 Py_INCREF(x);
8009 return x;
8010 } else
8011 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008012 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008013 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008014 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008015 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008016 long value = PyLong_AS_LONG(x);
8017 if (value < 0 || value > 255) {
8018 PyErr_SetString(PyExc_TypeError,
8019 "character mapping must be in range(256)");
8020 Py_DECREF(x);
8021 return NULL;
8022 }
8023 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008024 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008025 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008026 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008027 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 /* wrong return value */
8029 PyErr_Format(PyExc_TypeError,
8030 "character mapping must return integer, bytes or None, not %.400s",
8031 x->ob_type->tp_name);
8032 Py_DECREF(x);
8033 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008034 }
8035}
8036
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008037static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008038charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008039{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008040 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8041 /* exponentially overallocate to minimize reallocations */
8042 if (requiredsize < 2*outsize)
8043 requiredsize = 2*outsize;
8044 if (_PyBytes_Resize(outobj, requiredsize))
8045 return -1;
8046 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008047}
8048
/* Outcome of trying to encode one character with a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping does not define the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008052/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008053 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008054 space is available. Return a new reference to the object that
8055 was put in the output buffer, or Py_None, if the mapping was undefined
8056 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008057 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008058static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008059charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008060 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008061{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008062 PyObject *rep;
8063 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008064 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008065
Christian Heimes90aa7642007-12-19 02:45:37 +00008066 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008067 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008068 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008069 if (res == -1)
8070 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008071 if (outsize<requiredsize)
8072 if (charmapencode_resize(outobj, outpos, requiredsize))
8073 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008074 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008075 outstart[(*outpos)++] = (char)res;
8076 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008077 }
8078
8079 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008081 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008082 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008083 Py_DECREF(rep);
8084 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008085 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 if (PyLong_Check(rep)) {
8087 Py_ssize_t requiredsize = *outpos+1;
8088 if (outsize<requiredsize)
8089 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8090 Py_DECREF(rep);
8091 return enc_EXCEPTION;
8092 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008093 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008094 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008095 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 else {
8097 const char *repchars = PyBytes_AS_STRING(rep);
8098 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8099 Py_ssize_t requiredsize = *outpos+repsize;
8100 if (outsize<requiredsize)
8101 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8102 Py_DECREF(rep);
8103 return enc_EXCEPTION;
8104 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008105 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 memcpy(outstart + *outpos, repchars, repsize);
8107 *outpos += repsize;
8108 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008109 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008110 Py_DECREF(rep);
8111 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008112}
8113
8114/* handle an error in PyUnicode_EncodeCharmap
8115 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008116static int
8117charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008118 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008119 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008120 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008121 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008122{
8123 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008124 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008125 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008126 enum PyUnicode_Kind kind;
8127 void *data;
8128 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008129 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008130 Py_ssize_t collstartpos = *inpos;
8131 Py_ssize_t collendpos = *inpos+1;
8132 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008133 char *encoding = "charmap";
8134 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008135 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008136 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008137 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008138
Benjamin Petersonbac79492012-01-14 13:34:47 -05008139 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008140 return -1;
8141 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008142 /* find all unencodable characters */
8143 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008144 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008145 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008146 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008147 val = encoding_map_lookup(ch, mapping);
8148 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008149 break;
8150 ++collendpos;
8151 continue;
8152 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008153
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008154 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8155 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008156 if (rep==NULL)
8157 return -1;
8158 else if (rep!=Py_None) {
8159 Py_DECREF(rep);
8160 break;
8161 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008162 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008163 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008164 }
8165 /* cache callback name lookup
8166 * (if not done yet, i.e. it's the first error) */
8167 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008168 if ((errors==NULL) || (!strcmp(errors, "strict")))
8169 *known_errorHandler = 1;
8170 else if (!strcmp(errors, "replace"))
8171 *known_errorHandler = 2;
8172 else if (!strcmp(errors, "ignore"))
8173 *known_errorHandler = 3;
8174 else if (!strcmp(errors, "xmlcharrefreplace"))
8175 *known_errorHandler = 4;
8176 else
8177 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008178 }
8179 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008180 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008181 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008182 return -1;
8183 case 2: /* replace */
8184 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 x = charmapencode_output('?', mapping, res, respos);
8186 if (x==enc_EXCEPTION) {
8187 return -1;
8188 }
8189 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008190 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 return -1;
8192 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008193 }
8194 /* fall through */
8195 case 3: /* ignore */
8196 *inpos = collendpos;
8197 break;
8198 case 4: /* xmlcharrefreplace */
8199 /* generate replacement (temporarily (mis)uses p) */
8200 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 char buffer[2+29+1+1];
8202 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008203 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008204 for (cp = buffer; *cp; ++cp) {
8205 x = charmapencode_output(*cp, mapping, res, respos);
8206 if (x==enc_EXCEPTION)
8207 return -1;
8208 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008209 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008210 return -1;
8211 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008212 }
8213 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008214 *inpos = collendpos;
8215 break;
8216 default:
8217 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008218 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008219 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008220 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008221 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008222 if (PyBytes_Check(repunicode)) {
8223 /* Directly copy bytes result to output. */
8224 Py_ssize_t outsize = PyBytes_Size(*res);
8225 Py_ssize_t requiredsize;
8226 repsize = PyBytes_Size(repunicode);
8227 requiredsize = *respos + repsize;
8228 if (requiredsize > outsize)
8229 /* Make room for all additional bytes. */
8230 if (charmapencode_resize(res, respos, requiredsize)) {
8231 Py_DECREF(repunicode);
8232 return -1;
8233 }
8234 memcpy(PyBytes_AsString(*res) + *respos,
8235 PyBytes_AsString(repunicode), repsize);
8236 *respos += repsize;
8237 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008238 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008239 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008240 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008241 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008242 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008243 Py_DECREF(repunicode);
8244 return -1;
8245 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008246 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008247 data = PyUnicode_DATA(repunicode);
8248 kind = PyUnicode_KIND(repunicode);
8249 for (index = 0; index < repsize; index++) {
8250 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8251 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008252 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008253 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008254 return -1;
8255 }
8256 else if (x==enc_FAILED) {
8257 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008258 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008259 return -1;
8260 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008261 }
8262 *inpos = newpos;
8263 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 }
8265 return 0;
8266}
8267
Alexander Belopolsky40018472011-02-26 01:02:56 +00008268PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008269_PyUnicode_EncodeCharmap(PyObject *unicode,
8270 PyObject *mapping,
8271 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008272{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008273 /* output object */
8274 PyObject *res = NULL;
8275 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008276 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008277 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008278 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008279 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008280 PyObject *errorHandler = NULL;
8281 PyObject *exc = NULL;
8282 /* the following variable is used for caching string comparisons
8283 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8284 * 3=ignore, 4=xmlcharrefreplace */
8285 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008286 void *data;
8287 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008288
Benjamin Petersonbac79492012-01-14 13:34:47 -05008289 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008290 return NULL;
8291 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008292 data = PyUnicode_DATA(unicode);
8293 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008294
Guido van Rossumd57fd912000-03-10 22:53:23 +00008295 /* Default to Latin-1 */
8296 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008297 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008299 /* allocate enough for a simple encoding without
8300 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008301 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302 if (res == NULL)
8303 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008304 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008307 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008308 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008310 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 if (x==enc_EXCEPTION) /* error */
8312 goto onError;
8313 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008314 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 &exc,
8316 &known_errorHandler, &errorHandler, errors,
8317 &res, &respos)) {
8318 goto onError;
8319 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008320 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 else
8322 /* done with this character => adjust input position */
8323 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008326 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008327 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008328 if (_PyBytes_Resize(&res, respos) < 0)
8329 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008330
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008331 Py_XDECREF(exc);
8332 Py_XDECREF(errorHandler);
8333 return res;
8334
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008336 Py_XDECREF(res);
8337 Py_XDECREF(exc);
8338 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008339 return NULL;
8340}
8341
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008342/* Deprecated */
8343PyObject *
8344PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8345 Py_ssize_t size,
8346 PyObject *mapping,
8347 const char *errors)
8348{
8349 PyObject *result;
8350 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8351 if (unicode == NULL)
8352 return NULL;
8353 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8354 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008355 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008356}
8357
Alexander Belopolsky40018472011-02-26 01:02:56 +00008358PyObject *
8359PyUnicode_AsCharmapString(PyObject *unicode,
8360 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008361{
8362 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 PyErr_BadArgument();
8364 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008365 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008366 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008367}
8368
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008370static void
8371make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008372 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008373 Py_ssize_t startpos, Py_ssize_t endpos,
8374 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008375{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008377 *exceptionObject = _PyUnicodeTranslateError_Create(
8378 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379 }
8380 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8382 goto onError;
8383 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8384 goto onError;
8385 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8386 goto onError;
8387 return;
8388 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008389 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008390 }
8391}
8392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008393/* error handling callback helper:
8394 build arguments, call the callback and check the arguments,
8395 put the result into newpos and return the replacement string, which
8396 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008397static PyObject *
8398unicode_translate_call_errorhandler(const char *errors,
8399 PyObject **errorHandler,
8400 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008402 Py_ssize_t startpos, Py_ssize_t endpos,
8403 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008405 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008407 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 PyObject *restuple;
8409 PyObject *resunicode;
8410
8411 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 }
8416
8417 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008418 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008419 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008420 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421
8422 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008426 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008427 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008428 Py_DECREF(restuple);
8429 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 }
8431 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 &resunicode, &i_newpos)) {
8433 Py_DECREF(restuple);
8434 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008436 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008437 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008438 else
8439 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008440 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008441 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008442 Py_DECREF(restuple);
8443 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008444 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008445 Py_INCREF(resunicode);
8446 Py_DECREF(restuple);
8447 return resunicode;
8448}
8449
8450/* Lookup the character ch in the mapping and put the result in result,
8451 which must be decrefed by the caller.
8452 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008453static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008454charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008455{
Christian Heimes217cfd12007-12-02 14:31:20 +00008456 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008457 PyObject *x;
8458
8459 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008461 x = PyObject_GetItem(mapping, w);
8462 Py_DECREF(w);
8463 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8465 /* No mapping found means: use 1:1 mapping. */
8466 PyErr_Clear();
8467 *result = NULL;
8468 return 0;
8469 } else
8470 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008471 }
8472 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008473 *result = x;
8474 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008475 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008476 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008477 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008478 if (value < 0 || value > MAX_UNICODE) {
8479 PyErr_Format(PyExc_ValueError,
8480 "character mapping must be in range(0x%x)",
8481 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 Py_DECREF(x);
8483 return -1;
8484 }
8485 *result = x;
8486 return 0;
8487 }
8488 else if (PyUnicode_Check(x)) {
8489 *result = x;
8490 return 0;
8491 }
8492 else {
8493 /* wrong return value */
8494 PyErr_SetString(PyExc_TypeError,
8495 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008496 Py_DECREF(x);
8497 return -1;
8498 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008499}
Victor Stinner1194ea02014-04-04 19:37:40 +02008500
8501/* lookup the character, write the result into the writer.
8502 Return 1 if the result was written into the writer, return 0 if the mapping
8503 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008504static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008505charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8506 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507{
Victor Stinner1194ea02014-04-04 19:37:40 +02008508 PyObject *item;
8509
8510 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008511 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008512
8513 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008515 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008517 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008518 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008519 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008520
8521 if (item == Py_None) {
8522 Py_DECREF(item);
8523 return 0;
8524 }
8525
8526 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008527 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8528 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8529 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008530 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8531 Py_DECREF(item);
8532 return -1;
8533 }
8534 Py_DECREF(item);
8535 return 1;
8536 }
8537
8538 if (!PyUnicode_Check(item)) {
8539 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008541 }
8542
8543 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8544 Py_DECREF(item);
8545 return -1;
8546 }
8547
8548 Py_DECREF(item);
8549 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550}
8551
Victor Stinner89a76ab2014-04-05 11:44:04 +02008552static int
8553unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8554 Py_UCS1 *translate)
8555{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008556 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008557 int ret = 0;
8558
Victor Stinner89a76ab2014-04-05 11:44:04 +02008559 if (charmaptranslate_lookup(ch, mapping, &item)) {
8560 return -1;
8561 }
8562
8563 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008564 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008565 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008566 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008567 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008568 /* not found => default to 1:1 mapping */
8569 translate[ch] = ch;
8570 return 1;
8571 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008572 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008573 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008574 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8575 used it */
8576 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008577 /* invalid character or character outside ASCII:
8578 skip the fast translate */
8579 goto exit;
8580 }
8581 translate[ch] = (Py_UCS1)replace;
8582 }
8583 else if (PyUnicode_Check(item)) {
8584 Py_UCS4 replace;
8585
8586 if (PyUnicode_READY(item) == -1) {
8587 Py_DECREF(item);
8588 return -1;
8589 }
8590 if (PyUnicode_GET_LENGTH(item) != 1)
8591 goto exit;
8592
8593 replace = PyUnicode_READ_CHAR(item, 0);
8594 if (replace > 127)
8595 goto exit;
8596 translate[ch] = (Py_UCS1)replace;
8597 }
8598 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008599 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008600 goto exit;
8601 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008602 ret = 1;
8603
Benjamin Peterson1365de72014-04-07 20:15:41 -04008604 exit:
8605 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008606 return ret;
8607}
8608
8609/* Fast path for ascii => ascii translation. Return 1 if the whole string
8610 was translated into writer, return 0 if the input string was partially
8611 translated into writer, raise an exception and return -1 on error. */
8612static int
8613unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner872b2912014-04-05 14:27:07 +02008614 _PyUnicodeWriter *writer, int ignore)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008615{
Victor Stinner872b2912014-04-05 14:27:07 +02008616 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008617 Py_ssize_t len;
8618 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008619 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008620
8621 if (PyUnicode_READY(input) == -1)
8622 return -1;
8623 if (!PyUnicode_IS_ASCII(input))
8624 return 0;
8625 len = PyUnicode_GET_LENGTH(input);
8626
Victor Stinner872b2912014-04-05 14:27:07 +02008627 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008628
8629 in = PyUnicode_1BYTE_DATA(input);
8630 end = in + len;
8631
8632 assert(PyUnicode_IS_ASCII(writer->buffer));
8633 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8634 out = PyUnicode_1BYTE_DATA(writer->buffer);
8635
Victor Stinner872b2912014-04-05 14:27:07 +02008636 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008637 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008638 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008639 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008640 int translate = unicode_fast_translate_lookup(mapping, ch,
8641 ascii_table);
8642 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008643 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008644 if (translate == 0)
8645 goto exit;
8646 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008647 }
Victor Stinner872b2912014-04-05 14:27:07 +02008648 if (ch2 == 0xfe) {
8649 if (ignore)
8650 continue;
8651 goto exit;
8652 }
8653 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008654 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008655 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008656 }
Victor Stinner872b2912014-04-05 14:27:07 +02008657 res = 1;
8658
8659exit:
8660 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8661 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008662}
8663
Alexander Belopolsky40018472011-02-26 01:02:56 +00008664PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008665_PyUnicode_TranslateCharmap(PyObject *input,
8666 PyObject *mapping,
8667 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008670 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 Py_ssize_t size, i;
8672 int kind;
8673 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008674 _PyUnicodeWriter writer;
8675 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 char *reason = "character maps to <undefined>";
8677 PyObject *errorHandler = NULL;
8678 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008679 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008680 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008681
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008683 PyErr_BadArgument();
8684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687 if (PyUnicode_READY(input) == -1)
8688 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008689 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008690 kind = PyUnicode_KIND(input);
8691 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008692
8693 if (size == 0) {
8694 Py_INCREF(input);
8695 return input;
8696 }
8697
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698 /* allocate enough for a simple 1:1 translation without
8699 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008700 _PyUnicodeWriter_Init(&writer);
8701 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008703
Victor Stinner872b2912014-04-05 14:27:07 +02008704 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8705
8706 res = unicode_fast_translate(input, mapping, &writer, ignore);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008707 if (res < 0) {
8708 _PyUnicodeWriter_Dealloc(&writer);
8709 return NULL;
8710 }
8711 if (res == 1)
8712 return _PyUnicodeWriter_Finish(&writer);
8713
Victor Stinner89a76ab2014-04-05 11:44:04 +02008714 i = writer.pos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008717 int translate;
8718 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8719 Py_ssize_t newpos;
8720 /* startpos for collecting untranslatable chars */
8721 Py_ssize_t collstart;
8722 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008723 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724
Victor Stinner1194ea02014-04-04 19:37:40 +02008725 ch = PyUnicode_READ(kind, data, i);
8726 translate = charmaptranslate_output(ch, mapping, &writer);
8727 if (translate < 0)
8728 goto onError;
8729
8730 if (translate != 0) {
8731 /* it worked => adjust input pointer */
8732 ++i;
8733 continue;
8734 }
8735
8736 /* untranslatable character */
8737 collstart = i;
8738 collend = i+1;
8739
8740 /* find all untranslatable characters */
8741 while (collend < size) {
8742 PyObject *x;
8743 ch = PyUnicode_READ(kind, data, collend);
8744 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008745 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008746 Py_XDECREF(x);
8747 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008748 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008749 ++collend;
8750 }
8751
8752 if (ignore) {
8753 i = collend;
8754 }
8755 else {
8756 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8757 reason, input, &exc,
8758 collstart, collend, &newpos);
8759 if (repunicode == NULL)
8760 goto onError;
8761 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008762 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008763 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008764 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008765 Py_DECREF(repunicode);
8766 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008767 }
8768 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008769 Py_XDECREF(exc);
8770 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008771 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008772
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008774 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008775 Py_XDECREF(exc);
8776 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777 return NULL;
8778}
8779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008780/* Deprecated. Use PyUnicode_Translate instead. */
8781PyObject *
8782PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8783 Py_ssize_t size,
8784 PyObject *mapping,
8785 const char *errors)
8786{
Christian Heimes5f520f42012-09-11 14:03:25 +02008787 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008788 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8789 if (!unicode)
8790 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008791 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8792 Py_DECREF(unicode);
8793 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794}
8795
Alexander Belopolsky40018472011-02-26 01:02:56 +00008796PyObject *
8797PyUnicode_Translate(PyObject *str,
8798 PyObject *mapping,
8799 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800{
8801 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008802
Guido van Rossumd57fd912000-03-10 22:53:23 +00008803 str = PyUnicode_FromObject(str);
8804 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008805 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008807 Py_DECREF(str);
8808 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008809}
Tim Petersced69f82003-09-16 20:30:58 +00008810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008812fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813{
8814 /* No need to call PyUnicode_READY(self) because this function is only
8815 called as a callback from fixup() which does it already. */
8816 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8817 const int kind = PyUnicode_KIND(self);
8818 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008819 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008820 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008821 Py_ssize_t i;
8822
8823 for (i = 0; i < len; ++i) {
8824 ch = PyUnicode_READ(kind, data, i);
8825 fixed = 0;
8826 if (ch > 127) {
8827 if (Py_UNICODE_ISSPACE(ch))
8828 fixed = ' ';
8829 else {
8830 const int decimal = Py_UNICODE_TODECIMAL(ch);
8831 if (decimal >= 0)
8832 fixed = '0' + decimal;
8833 }
8834 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008835 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008836 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008837 PyUnicode_WRITE(kind, data, i, fixed);
8838 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008839 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008840 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008842 }
8843
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008844 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008845}
8846
8847PyObject *
8848_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8849{
8850 if (!PyUnicode_Check(unicode)) {
8851 PyErr_BadInternalCall();
8852 return NULL;
8853 }
8854 if (PyUnicode_READY(unicode) == -1)
8855 return NULL;
8856 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8857 /* If the string is already ASCII, just return the same string */
8858 Py_INCREF(unicode);
8859 return unicode;
8860 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008861 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008862}
8863
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008864PyObject *
8865PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8866 Py_ssize_t length)
8867{
Victor Stinnerf0124502011-11-21 23:12:56 +01008868 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008869 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008870 Py_UCS4 maxchar;
8871 enum PyUnicode_Kind kind;
8872 void *data;
8873
Victor Stinner99d7ad02012-02-22 13:37:39 +01008874 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008875 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008876 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008877 if (ch > 127) {
8878 int decimal = Py_UNICODE_TODECIMAL(ch);
8879 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008880 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008881 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008882 }
8883 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008884
8885 /* Copy to a new string */
8886 decimal = PyUnicode_New(length, maxchar);
8887 if (decimal == NULL)
8888 return decimal;
8889 kind = PyUnicode_KIND(decimal);
8890 data = PyUnicode_DATA(decimal);
8891 /* Iterate over code points */
8892 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008893 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01008894 if (ch > 127) {
8895 int decimal = Py_UNICODE_TODECIMAL(ch);
8896 if (decimal >= 0)
8897 ch = '0' + decimal;
8898 }
8899 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008900 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008901 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008902}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008903/* --- Decimal Encoder ---------------------------------------------------- */
8904
Alexander Belopolsky40018472011-02-26 01:02:56 +00008905int
8906PyUnicode_EncodeDecimal(Py_UNICODE *s,
8907 Py_ssize_t length,
8908 char *output,
8909 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008910{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008911 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008912 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008913 enum PyUnicode_Kind kind;
8914 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008915
8916 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008917 PyErr_BadArgument();
8918 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008919 }
8920
Victor Stinner42bf7752011-11-21 22:52:58 +01008921 unicode = PyUnicode_FromUnicode(s, length);
8922 if (unicode == NULL)
8923 return -1;
8924
Benjamin Petersonbac79492012-01-14 13:34:47 -05008925 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008926 Py_DECREF(unicode);
8927 return -1;
8928 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008929 kind = PyUnicode_KIND(unicode);
8930 data = PyUnicode_DATA(unicode);
8931
Victor Stinnerb84d7232011-11-22 01:50:07 +01008932 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008933 PyObject *exc;
8934 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008935 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008936 Py_ssize_t startpos;
8937
8938 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008939
Benjamin Peterson29060642009-01-31 22:14:21 +00008940 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008941 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008942 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008944 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008945 decimal = Py_UNICODE_TODECIMAL(ch);
8946 if (decimal >= 0) {
8947 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008948 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 continue;
8950 }
8951 if (0 < ch && ch < 256) {
8952 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008953 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 continue;
8955 }
Victor Stinner6345be92011-11-25 20:09:01 +01008956
Victor Stinner42bf7752011-11-21 22:52:58 +01008957 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008958 exc = NULL;
8959 raise_encode_exception(&exc, "decimal", unicode,
8960 startpos, startpos+1,
8961 "invalid decimal Unicode string");
8962 Py_XDECREF(exc);
8963 Py_DECREF(unicode);
8964 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008965 }
8966 /* 0-terminate the output string */
8967 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008968 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008969 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008970}
8971
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972/* --- Helpers ------------------------------------------------------------ */
8973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008975any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976 Py_ssize_t start,
8977 Py_ssize_t end)
8978{
8979 int kind1, kind2, kind;
8980 void *buf1, *buf2;
8981 Py_ssize_t len1, len2, result;
8982
8983 kind1 = PyUnicode_KIND(s1);
8984 kind2 = PyUnicode_KIND(s2);
8985 kind = kind1 > kind2 ? kind1 : kind2;
8986 buf1 = PyUnicode_DATA(s1);
8987 buf2 = PyUnicode_DATA(s2);
8988 if (kind1 != kind)
8989 buf1 = _PyUnicode_AsKind(s1, kind);
8990 if (!buf1)
8991 return -2;
8992 if (kind2 != kind)
8993 buf2 = _PyUnicode_AsKind(s2, kind);
8994 if (!buf2) {
8995 if (kind1 != kind) PyMem_Free(buf1);
8996 return -2;
8997 }
8998 len1 = PyUnicode_GET_LENGTH(s1);
8999 len2 = PyUnicode_GET_LENGTH(s2);
9000
Victor Stinner794d5672011-10-10 03:21:36 +02009001 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009002 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009003 case PyUnicode_1BYTE_KIND:
9004 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9005 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9006 else
9007 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9008 break;
9009 case PyUnicode_2BYTE_KIND:
9010 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9011 break;
9012 case PyUnicode_4BYTE_KIND:
9013 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9014 break;
9015 default:
9016 assert(0); result = -2;
9017 }
9018 }
9019 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009020 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009021 case PyUnicode_1BYTE_KIND:
9022 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9023 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9024 else
9025 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9026 break;
9027 case PyUnicode_2BYTE_KIND:
9028 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9029 break;
9030 case PyUnicode_4BYTE_KIND:
9031 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9032 break;
9033 default:
9034 assert(0); result = -2;
9035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009036 }
9037
9038 if (kind1 != kind)
9039 PyMem_Free(buf1);
9040 if (kind2 != kind)
9041 PyMem_Free(buf2);
9042
9043 return result;
9044}
9045
9046Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009047_PyUnicode_InsertThousandsGrouping(
9048 PyObject *unicode, Py_ssize_t index,
9049 Py_ssize_t n_buffer,
9050 void *digits, Py_ssize_t n_digits,
9051 Py_ssize_t min_width,
9052 const char *grouping, PyObject *thousands_sep,
9053 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009054{
Victor Stinner41a863c2012-02-24 00:37:51 +01009055 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009056 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009057 Py_ssize_t thousands_sep_len;
9058 Py_ssize_t len;
9059
9060 if (unicode != NULL) {
9061 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009062 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009063 }
9064 else {
9065 kind = PyUnicode_1BYTE_KIND;
9066 data = NULL;
9067 }
9068 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9069 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9070 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9071 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009072 if (thousands_sep_kind < kind) {
9073 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9074 if (!thousands_sep_data)
9075 return -1;
9076 }
9077 else {
9078 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9079 if (!data)
9080 return -1;
9081 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009082 }
9083
Benjamin Petersonead6b532011-12-20 17:23:42 -06009084 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009086 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009087 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009088 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009089 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009090 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009091 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009092 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009093 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009094 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009095 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009096 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009097 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009098 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009099 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009100 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009101 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009102 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009103 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009104 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009105 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009106 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009107 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009108 break;
9109 default:
9110 assert(0);
9111 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009113 if (unicode != NULL && thousands_sep_kind != kind) {
9114 if (thousands_sep_kind < kind)
9115 PyMem_Free(thousands_sep_data);
9116 else
9117 PyMem_Free(data);
9118 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009119 if (unicode == NULL) {
9120 *maxchar = 127;
9121 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009122 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009123 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009124 }
9125 }
9126 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009127}
9128
9129
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped to 0 if still negative.  `start` is not clamped
   against `len`.  `start` and `end` must be modifiable lvalues; every
   argument may be evaluated more than once.

   Wrapped in do { } while (0) so the expansion is a single statement and is
   safe inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
Thomas Wouters477c8d52006-05-27 19:21:47 +00009144
Alexander Belopolsky40018472011-02-26 01:02:56 +00009145Py_ssize_t
9146PyUnicode_Count(PyObject *str,
9147 PyObject *substr,
9148 Py_ssize_t start,
9149 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009151 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009152 PyObject* str_obj;
9153 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 int kind1, kind2, kind;
9155 void *buf1 = NULL, *buf2 = NULL;
9156 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009157
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009158 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009159 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009160 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009161 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009162 if (!sub_obj) {
9163 Py_DECREF(str_obj);
9164 return -1;
9165 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009166 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009167 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009168 Py_DECREF(str_obj);
9169 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009170 }
Tim Petersced69f82003-09-16 20:30:58 +00009171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009172 kind1 = PyUnicode_KIND(str_obj);
9173 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009174 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009176 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009177 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009178 if (kind2 > kind) {
9179 Py_DECREF(sub_obj);
9180 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009181 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009182 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009183 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009184 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009185 if (!buf2)
9186 goto onError;
9187 len1 = PyUnicode_GET_LENGTH(str_obj);
9188 len2 = PyUnicode_GET_LENGTH(sub_obj);
9189
9190 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009191 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009192 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009193 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9194 result = asciilib_count(
9195 ((Py_UCS1*)buf1) + start, end - start,
9196 buf2, len2, PY_SSIZE_T_MAX
9197 );
9198 else
9199 result = ucs1lib_count(
9200 ((Py_UCS1*)buf1) + start, end - start,
9201 buf2, len2, PY_SSIZE_T_MAX
9202 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009203 break;
9204 case PyUnicode_2BYTE_KIND:
9205 result = ucs2lib_count(
9206 ((Py_UCS2*)buf1) + start, end - start,
9207 buf2, len2, PY_SSIZE_T_MAX
9208 );
9209 break;
9210 case PyUnicode_4BYTE_KIND:
9211 result = ucs4lib_count(
9212 ((Py_UCS4*)buf1) + start, end - start,
9213 buf2, len2, PY_SSIZE_T_MAX
9214 );
9215 break;
9216 default:
9217 assert(0); result = 0;
9218 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009219
9220 Py_DECREF(sub_obj);
9221 Py_DECREF(str_obj);
9222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009223 if (kind2 != kind)
9224 PyMem_Free(buf2);
9225
Guido van Rossumd57fd912000-03-10 22:53:23 +00009226 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 onError:
9228 Py_DECREF(sub_obj);
9229 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009230 if (kind2 != kind && buf2)
9231 PyMem_Free(buf2);
9232 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009233}
9234
Alexander Belopolsky40018472011-02-26 01:02:56 +00009235Py_ssize_t
9236PyUnicode_Find(PyObject *str,
9237 PyObject *sub,
9238 Py_ssize_t start,
9239 Py_ssize_t end,
9240 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009242 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009243
Guido van Rossumd57fd912000-03-10 22:53:23 +00009244 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009245 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009246 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009247 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009248 if (!sub) {
9249 Py_DECREF(str);
9250 return -2;
9251 }
9252 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9253 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009254 Py_DECREF(str);
9255 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009256 }
Tim Petersced69f82003-09-16 20:30:58 +00009257
Victor Stinner794d5672011-10-10 03:21:36 +02009258 result = any_find_slice(direction,
9259 str, sub, start, end
9260 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009261
Guido van Rossumd57fd912000-03-10 22:53:23 +00009262 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009263 Py_DECREF(sub);
9264
Guido van Rossumd57fd912000-03-10 22:53:23 +00009265 return result;
9266}
9267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268Py_ssize_t
9269PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9270 Py_ssize_t start, Py_ssize_t end,
9271 int direction)
9272{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009274 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 if (PyUnicode_READY(str) == -1)
9276 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009277 if (start < 0 || end < 0) {
9278 PyErr_SetString(PyExc_IndexError, "string index out of range");
9279 return -2;
9280 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009281 if (end > PyUnicode_GET_LENGTH(str))
9282 end = PyUnicode_GET_LENGTH(str);
9283 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009284 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9285 kind, end-start, ch, direction);
9286 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009288 else
9289 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290}
9291
Alexander Belopolsky40018472011-02-26 01:02:56 +00009292static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009293tailmatch(PyObject *self,
9294 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009295 Py_ssize_t start,
9296 Py_ssize_t end,
9297 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009298{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009299 int kind_self;
9300 int kind_sub;
9301 void *data_self;
9302 void *data_sub;
9303 Py_ssize_t offset;
9304 Py_ssize_t i;
9305 Py_ssize_t end_sub;
9306
9307 if (PyUnicode_READY(self) == -1 ||
9308 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009309 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310
9311 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312 return 1;
9313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9315 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009316 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009317 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319 kind_self = PyUnicode_KIND(self);
9320 data_self = PyUnicode_DATA(self);
9321 kind_sub = PyUnicode_KIND(substring);
9322 data_sub = PyUnicode_DATA(substring);
9323 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9324
9325 if (direction > 0)
9326 offset = end;
9327 else
9328 offset = start;
9329
9330 if (PyUnicode_READ(kind_self, data_self, offset) ==
9331 PyUnicode_READ(kind_sub, data_sub, 0) &&
9332 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9333 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9334 /* If both are of the same kind, memcmp is sufficient */
9335 if (kind_self == kind_sub) {
9336 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009337 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 data_sub,
9339 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009340 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341 }
9342 /* otherwise we have to compare each character by first accesing it */
9343 else {
9344 /* We do not need to compare 0 and len(substring)-1 because
9345 the if statement above ensured already that they are equal
9346 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 for (i = 1; i < end_sub; ++i) {
9348 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9349 PyUnicode_READ(kind_sub, data_sub, i))
9350 return 0;
9351 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009352 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009353 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009354 }
9355
9356 return 0;
9357}
9358
Alexander Belopolsky40018472011-02-26 01:02:56 +00009359Py_ssize_t
9360PyUnicode_Tailmatch(PyObject *str,
9361 PyObject *substr,
9362 Py_ssize_t start,
9363 Py_ssize_t end,
9364 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009366 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009367
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368 str = PyUnicode_FromObject(str);
9369 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009370 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009371 substr = PyUnicode_FromObject(substr);
9372 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009373 Py_DECREF(str);
9374 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009375 }
Tim Petersced69f82003-09-16 20:30:58 +00009376
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009377 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009378 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009379 Py_DECREF(str);
9380 Py_DECREF(substr);
9381 return result;
9382}
9383
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384/* Apply fixfct filter to the Unicode object self and return a
9385 reference to the modified object */
9386
Alexander Belopolsky40018472011-02-26 01:02:56 +00009387static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009388fixup(PyObject *self,
9389 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009390{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391 PyObject *u;
9392 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009393 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009394
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009395 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009396 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009397 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009398 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009399
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009400 /* fix functions return the new maximum character in a string,
9401 if the kind of the resulting unicode object does not change,
9402 everything is fine. Otherwise we need to change the string kind
9403 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009404 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009405
9406 if (maxchar_new == 0) {
9407 /* no changes */;
9408 if (PyUnicode_CheckExact(self)) {
9409 Py_DECREF(u);
9410 Py_INCREF(self);
9411 return self;
9412 }
9413 else
9414 return u;
9415 }
9416
Victor Stinnere6abb482012-05-02 01:15:40 +02009417 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009418
Victor Stinnereaab6042011-12-11 22:22:39 +01009419 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009421
9422 /* In case the maximum character changed, we need to
9423 convert the string to the new category. */
9424 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9425 if (v == NULL) {
9426 Py_DECREF(u);
9427 return NULL;
9428 }
9429 if (maxchar_new > maxchar_old) {
9430 /* If the maxchar increased so that the kind changed, not all
9431 characters are representable anymore and we need to fix the
9432 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009433 _PyUnicode_FastCopyCharacters(v, 0,
9434 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009435 maxchar_old = fixfct(v);
9436 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 }
9438 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009439 _PyUnicode_FastCopyCharacters(v, 0,
9440 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009442 Py_DECREF(u);
9443 assert(_PyUnicode_CheckConsistency(v, 1));
9444 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445}
9446
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009447static PyObject *
9448ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009449{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009450 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9451 char *resdata, *data = PyUnicode_DATA(self);
9452 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009453
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009454 res = PyUnicode_New(len, 127);
9455 if (res == NULL)
9456 return NULL;
9457 resdata = PyUnicode_DATA(res);
9458 if (lower)
9459 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009460 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009461 _Py_bytes_upper(resdata, data, len);
9462 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009463}
9464
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009465static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009466handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009467{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009468 Py_ssize_t j;
9469 int final_sigma;
9470 Py_UCS4 c;
9471 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009472
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009473 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9474
9475 where ! is a negation and \p{xxx} is a character with property xxx.
9476 */
9477 for (j = i - 1; j >= 0; j--) {
9478 c = PyUnicode_READ(kind, data, j);
9479 if (!_PyUnicode_IsCaseIgnorable(c))
9480 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009481 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009482 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9483 if (final_sigma) {
9484 for (j = i + 1; j < length; j++) {
9485 c = PyUnicode_READ(kind, data, j);
9486 if (!_PyUnicode_IsCaseIgnorable(c))
9487 break;
9488 }
9489 final_sigma = j == length || !_PyUnicode_IsCased(c);
9490 }
9491 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492}
9493
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009494static int
9495lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9496 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009498 /* Obscure special case. */
9499 if (c == 0x3A3) {
9500 mapped[0] = handle_capital_sigma(kind, data, length, i);
9501 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009503 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009504}
9505
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009506static Py_ssize_t
9507do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009509 Py_ssize_t i, k = 0;
9510 int n_res, j;
9511 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009512
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009513 c = PyUnicode_READ(kind, data, 0);
9514 n_res = _PyUnicode_ToUpperFull(c, mapped);
9515 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009516 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009517 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009518 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009519 for (i = 1; i < length; i++) {
9520 c = PyUnicode_READ(kind, data, i);
9521 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9522 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009523 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009524 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009525 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009526 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009527 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009528}
9529
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009530static Py_ssize_t
9531do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9532 Py_ssize_t i, k = 0;
9533
9534 for (i = 0; i < length; i++) {
9535 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9536 int n_res, j;
9537 if (Py_UNICODE_ISUPPER(c)) {
9538 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9539 }
9540 else if (Py_UNICODE_ISLOWER(c)) {
9541 n_res = _PyUnicode_ToUpperFull(c, mapped);
9542 }
9543 else {
9544 n_res = 1;
9545 mapped[0] = c;
9546 }
9547 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009548 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009549 res[k++] = mapped[j];
9550 }
9551 }
9552 return k;
9553}
9554
9555static Py_ssize_t
9556do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9557 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009558{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009559 Py_ssize_t i, k = 0;
9560
9561 for (i = 0; i < length; i++) {
9562 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9563 int n_res, j;
9564 if (lower)
9565 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9566 else
9567 n_res = _PyUnicode_ToUpperFull(c, mapped);
9568 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009569 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009570 res[k++] = mapped[j];
9571 }
9572 }
9573 return k;
9574}
9575
9576static Py_ssize_t
9577do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9578{
9579 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9580}
9581
9582static Py_ssize_t
9583do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9584{
9585 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9586}
9587
Benjamin Petersone51757f2012-01-12 21:10:29 -05009588static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009589do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9590{
9591 Py_ssize_t i, k = 0;
9592
9593 for (i = 0; i < length; i++) {
9594 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9595 Py_UCS4 mapped[3];
9596 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9597 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009598 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009599 res[k++] = mapped[j];
9600 }
9601 }
9602 return k;
9603}
9604
9605static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009606do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9607{
9608 Py_ssize_t i, k = 0;
9609 int previous_is_cased;
9610
9611 previous_is_cased = 0;
9612 for (i = 0; i < length; i++) {
9613 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9614 Py_UCS4 mapped[3];
9615 int n_res, j;
9616
9617 if (previous_is_cased)
9618 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9619 else
9620 n_res = _PyUnicode_ToTitleFull(c, mapped);
9621
9622 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009623 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009624 res[k++] = mapped[j];
9625 }
9626
9627 previous_is_cased = _PyUnicode_IsCased(c);
9628 }
9629 return k;
9630}
9631
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009632static PyObject *
9633case_operation(PyObject *self,
9634 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9635{
9636 PyObject *res = NULL;
9637 Py_ssize_t length, newlength = 0;
9638 int kind, outkind;
9639 void *data, *outdata;
9640 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9641
Benjamin Petersoneea48462012-01-16 14:28:50 -05009642 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009643
9644 kind = PyUnicode_KIND(self);
9645 data = PyUnicode_DATA(self);
9646 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009647 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009648 PyErr_SetString(PyExc_OverflowError, "string is too long");
9649 return NULL;
9650 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009651 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009652 if (tmp == NULL)
9653 return PyErr_NoMemory();
9654 newlength = perform(kind, data, length, tmp, &maxchar);
9655 res = PyUnicode_New(newlength, maxchar);
9656 if (res == NULL)
9657 goto leave;
9658 tmpend = tmp + newlength;
9659 outdata = PyUnicode_DATA(res);
9660 outkind = PyUnicode_KIND(res);
9661 switch (outkind) {
9662 case PyUnicode_1BYTE_KIND:
9663 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9664 break;
9665 case PyUnicode_2BYTE_KIND:
9666 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9667 break;
9668 case PyUnicode_4BYTE_KIND:
9669 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9670 break;
9671 default:
9672 assert(0);
9673 break;
9674 }
9675 leave:
9676 PyMem_FREE(tmp);
9677 return res;
9678}
9679
Tim Peters8ce9f162004-08-27 01:49:32 +00009680PyObject *
9681PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009682{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009684 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009685 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009686 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009687 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9688 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009689 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009691 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009693 int use_memcpy;
9694 unsigned char *res_data = NULL, *sep_data = NULL;
9695 PyObject *last_obj;
9696 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009697
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009698 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009699 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009700 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009701 }
9702
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009703 /* NOTE: the following code can't call back into Python code,
9704 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009705 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009706
Tim Peters05eba1f2004-08-27 21:32:02 +00009707 seqlen = PySequence_Fast_GET_SIZE(fseq);
9708 /* If empty sequence, return u"". */
9709 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009710 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009711 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009712 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009713
Tim Peters05eba1f2004-08-27 21:32:02 +00009714 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009715 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009716 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009717 if (seqlen == 1) {
9718 if (PyUnicode_CheckExact(items[0])) {
9719 res = items[0];
9720 Py_INCREF(res);
9721 Py_DECREF(fseq);
9722 return res;
9723 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009724 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009725 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009726 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009727 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009728 /* Set up sep and seplen */
9729 if (separator == NULL) {
9730 /* fall back to a blank space separator */
9731 sep = PyUnicode_FromOrdinal(' ');
9732 if (!sep)
9733 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009734 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009735 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009736 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009737 else {
9738 if (!PyUnicode_Check(separator)) {
9739 PyErr_Format(PyExc_TypeError,
9740 "separator: expected str instance,"
9741 " %.80s found",
9742 Py_TYPE(separator)->tp_name);
9743 goto onError;
9744 }
9745 if (PyUnicode_READY(separator))
9746 goto onError;
9747 sep = separator;
9748 seplen = PyUnicode_GET_LENGTH(separator);
9749 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9750 /* inc refcount to keep this code path symmetric with the
9751 above case of a blank separator */
9752 Py_INCREF(sep);
9753 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009754 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009755 }
9756
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009757 /* There are at least two things to join, or else we have a subclass
9758 * of str in the sequence.
9759 * Do a pre-pass to figure out the total amount of space we'll
9760 * need (sz), and see whether all argument are strings.
9761 */
9762 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009763#ifdef Py_DEBUG
9764 use_memcpy = 0;
9765#else
9766 use_memcpy = 1;
9767#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009768 for (i = 0; i < seqlen; i++) {
9769 const Py_ssize_t old_sz = sz;
9770 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009771 if (!PyUnicode_Check(item)) {
9772 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009773 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009774 " %.80s found",
9775 i, Py_TYPE(item)->tp_name);
9776 goto onError;
9777 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 if (PyUnicode_READY(item) == -1)
9779 goto onError;
9780 sz += PyUnicode_GET_LENGTH(item);
9781 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009782 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009783 if (i != 0)
9784 sz += seplen;
9785 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9786 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009787 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009788 goto onError;
9789 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009790 if (use_memcpy && last_obj != NULL) {
9791 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9792 use_memcpy = 0;
9793 }
9794 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009795 }
Tim Petersced69f82003-09-16 20:30:58 +00009796
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009797 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009798 if (res == NULL)
9799 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009800
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009801 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009802#ifdef Py_DEBUG
9803 use_memcpy = 0;
9804#else
9805 if (use_memcpy) {
9806 res_data = PyUnicode_1BYTE_DATA(res);
9807 kind = PyUnicode_KIND(res);
9808 if (seplen != 0)
9809 sep_data = PyUnicode_1BYTE_DATA(sep);
9810 }
9811#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009812 if (use_memcpy) {
9813 for (i = 0; i < seqlen; ++i) {
9814 Py_ssize_t itemlen;
9815 item = items[i];
9816
9817 /* Copy item, and maybe the separator. */
9818 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009819 Py_MEMCPY(res_data,
9820 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009821 kind * seplen);
9822 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009823 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009824
9825 itemlen = PyUnicode_GET_LENGTH(item);
9826 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009827 Py_MEMCPY(res_data,
9828 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009829 kind * itemlen);
9830 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009831 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009832 }
9833 assert(res_data == PyUnicode_1BYTE_DATA(res)
9834 + kind * PyUnicode_GET_LENGTH(res));
9835 }
9836 else {
9837 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9838 Py_ssize_t itemlen;
9839 item = items[i];
9840
9841 /* Copy item, and maybe the separator. */
9842 if (i && seplen != 0) {
9843 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9844 res_offset += seplen;
9845 }
9846
9847 itemlen = PyUnicode_GET_LENGTH(item);
9848 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009849 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009850 res_offset += itemlen;
9851 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009852 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009853 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009854 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009855
Tim Peters05eba1f2004-08-27 21:32:02 +00009856 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009858 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009859 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009860
Benjamin Peterson29060642009-01-31 22:14:21 +00009861 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009862 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009863 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009864 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009865 return NULL;
9866}
9867
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code units of the buffer `data`,
   starting at code-unit index `start`.  `kind` selects the code-unit width:
   PyUnicode_1BYTE_KIND uses memset(), the 2-byte and 4-byte kinds use an
   explicit loop over Py_UCS2 / Py_UCS4 elements.

   PyUnicode_WCHAR_KIND and any other kind are rejected with assert() only.

   This is a statement macro: the arguments may be evaluated more than once
   (`kind` in the assert and the switch, `length` and `value` on every loop
   iteration), so they must be free of side effects.  Every use of an
   argument is parenthesized so that expression arguments bind as expected. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9891
Victor Stinnerd3f08822012-05-29 12:57:52 +02009892void
9893_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9894 Py_UCS4 fill_char)
9895{
9896 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9897 const void *data = PyUnicode_DATA(unicode);
9898 assert(PyUnicode_IS_READY(unicode));
9899 assert(unicode_modifiable(unicode));
9900 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9901 assert(start >= 0);
9902 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9903 FILL(kind, data, fill_char, start, length);
9904}
9905
Victor Stinner3fe55312012-01-04 00:33:50 +01009906Py_ssize_t
9907PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9908 Py_UCS4 fill_char)
9909{
9910 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009911
9912 if (!PyUnicode_Check(unicode)) {
9913 PyErr_BadInternalCall();
9914 return -1;
9915 }
9916 if (PyUnicode_READY(unicode) == -1)
9917 return -1;
9918 if (unicode_check_modifiable(unicode))
9919 return -1;
9920
Victor Stinnerd3f08822012-05-29 12:57:52 +02009921 if (start < 0) {
9922 PyErr_SetString(PyExc_IndexError, "string index out of range");
9923 return -1;
9924 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009925 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9926 PyErr_SetString(PyExc_ValueError,
9927 "fill character is bigger than "
9928 "the string maximum character");
9929 return -1;
9930 }
9931
9932 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9933 length = Py_MIN(maxlen, length);
9934 if (length <= 0)
9935 return 0;
9936
Victor Stinnerd3f08822012-05-29 12:57:52 +02009937 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009938 return length;
9939}
9940
Victor Stinner9310abb2011-10-05 00:59:23 +02009941static PyObject *
9942pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009943 Py_ssize_t left,
9944 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009946{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009947 PyObject *u;
9948 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009949 int kind;
9950 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951
9952 if (left < 0)
9953 left = 0;
9954 if (right < 0)
9955 right = 0;
9956
Victor Stinnerc4b49542011-12-11 22:44:26 +01009957 if (left == 0 && right == 0)
9958 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009960 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9961 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009962 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9963 return NULL;
9964 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009966 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009967 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009968 if (!u)
9969 return NULL;
9970
9971 kind = PyUnicode_KIND(u);
9972 data = PyUnicode_DATA(u);
9973 if (left)
9974 FILL(kind, data, fill, 0, left);
9975 if (right)
9976 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009977 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009978 assert(_PyUnicode_CheckConsistency(u, 1));
9979 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009980}
9981
Alexander Belopolsky40018472011-02-26 01:02:56 +00009982PyObject *
9983PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009985 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009986
9987 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009988 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009989 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009990 if (PyUnicode_READY(string) == -1) {
9991 Py_DECREF(string);
9992 return NULL;
9993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009994
Benjamin Petersonead6b532011-12-20 17:23:42 -06009995 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009997 if (PyUnicode_IS_ASCII(string))
9998 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009999 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010000 PyUnicode_GET_LENGTH(string), keepends);
10001 else
10002 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010003 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010004 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 break;
10006 case PyUnicode_2BYTE_KIND:
10007 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010008 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 PyUnicode_GET_LENGTH(string), keepends);
10010 break;
10011 case PyUnicode_4BYTE_KIND:
10012 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010013 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 PyUnicode_GET_LENGTH(string), keepends);
10015 break;
10016 default:
10017 assert(0);
10018 list = 0;
10019 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010020 Py_DECREF(string);
10021 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010022}
10023
Alexander Belopolsky40018472011-02-26 01:02:56 +000010024static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010025split(PyObject *self,
10026 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010027 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 int kind1, kind2, kind;
10030 void *buf1, *buf2;
10031 Py_ssize_t len1, len2;
10032 PyObject* out;
10033
Guido van Rossumd57fd912000-03-10 22:53:23 +000010034 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010035 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010037 if (PyUnicode_READY(self) == -1)
10038 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010040 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010041 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010043 if (PyUnicode_IS_ASCII(self))
10044 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010045 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010046 PyUnicode_GET_LENGTH(self), maxcount
10047 );
10048 else
10049 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010050 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010051 PyUnicode_GET_LENGTH(self), maxcount
10052 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 case PyUnicode_2BYTE_KIND:
10054 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010055 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010056 PyUnicode_GET_LENGTH(self), maxcount
10057 );
10058 case PyUnicode_4BYTE_KIND:
10059 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010060 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 PyUnicode_GET_LENGTH(self), maxcount
10062 );
10063 default:
10064 assert(0);
10065 return NULL;
10066 }
10067
10068 if (PyUnicode_READY(substring) == -1)
10069 return NULL;
10070
10071 kind1 = PyUnicode_KIND(self);
10072 kind2 = PyUnicode_KIND(substring);
10073 kind = kind1 > kind2 ? kind1 : kind2;
10074 buf1 = PyUnicode_DATA(self);
10075 buf2 = PyUnicode_DATA(substring);
10076 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010077 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010078 if (!buf1)
10079 return NULL;
10080 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010081 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010082 if (!buf2) {
10083 if (kind1 != kind) PyMem_Free(buf1);
10084 return NULL;
10085 }
10086 len1 = PyUnicode_GET_LENGTH(self);
10087 len2 = PyUnicode_GET_LENGTH(substring);
10088
Benjamin Petersonead6b532011-12-20 17:23:42 -060010089 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010091 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10092 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010093 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010094 else
10095 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010096 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 break;
10098 case PyUnicode_2BYTE_KIND:
10099 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010100 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 break;
10102 case PyUnicode_4BYTE_KIND:
10103 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010104 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 break;
10106 default:
10107 out = NULL;
10108 }
10109 if (kind1 != kind)
10110 PyMem_Free(buf1);
10111 if (kind2 != kind)
10112 PyMem_Free(buf2);
10113 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010114}
10115
Alexander Belopolsky40018472011-02-26 01:02:56 +000010116static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010117rsplit(PyObject *self,
10118 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010119 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010120{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010121 int kind1, kind2, kind;
10122 void *buf1, *buf2;
10123 Py_ssize_t len1, len2;
10124 PyObject* out;
10125
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010126 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010127 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010129 if (PyUnicode_READY(self) == -1)
10130 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010132 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010133 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010134 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010135 if (PyUnicode_IS_ASCII(self))
10136 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010137 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010138 PyUnicode_GET_LENGTH(self), maxcount
10139 );
10140 else
10141 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010142 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010143 PyUnicode_GET_LENGTH(self), maxcount
10144 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010145 case PyUnicode_2BYTE_KIND:
10146 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010147 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010148 PyUnicode_GET_LENGTH(self), maxcount
10149 );
10150 case PyUnicode_4BYTE_KIND:
10151 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010152 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010153 PyUnicode_GET_LENGTH(self), maxcount
10154 );
10155 default:
10156 assert(0);
10157 return NULL;
10158 }
10159
10160 if (PyUnicode_READY(substring) == -1)
10161 return NULL;
10162
10163 kind1 = PyUnicode_KIND(self);
10164 kind2 = PyUnicode_KIND(substring);
10165 kind = kind1 > kind2 ? kind1 : kind2;
10166 buf1 = PyUnicode_DATA(self);
10167 buf2 = PyUnicode_DATA(substring);
10168 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010169 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 if (!buf1)
10171 return NULL;
10172 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010173 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 if (!buf2) {
10175 if (kind1 != kind) PyMem_Free(buf1);
10176 return NULL;
10177 }
10178 len1 = PyUnicode_GET_LENGTH(self);
10179 len2 = PyUnicode_GET_LENGTH(substring);
10180
Benjamin Petersonead6b532011-12-20 17:23:42 -060010181 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010183 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10184 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010185 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010186 else
10187 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010188 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 break;
10190 case PyUnicode_2BYTE_KIND:
10191 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010192 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 break;
10194 case PyUnicode_4BYTE_KIND:
10195 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010196 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010197 break;
10198 default:
10199 out = NULL;
10200 }
10201 if (kind1 != kind)
10202 PyMem_Free(buf1);
10203 if (kind2 != kind)
10204 PyMem_Free(buf2);
10205 return out;
10206}
10207
10208static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010209anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10210 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010211{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010212 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010213 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10215 return asciilib_find(buf1, len1, buf2, len2, offset);
10216 else
10217 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010218 case PyUnicode_2BYTE_KIND:
10219 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10220 case PyUnicode_4BYTE_KIND:
10221 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10222 }
10223 assert(0);
10224 return -1;
10225}
10226
10227static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010228anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10229 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010231 switch (kind) {
10232 case PyUnicode_1BYTE_KIND:
10233 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10234 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10235 else
10236 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10237 case PyUnicode_2BYTE_KIND:
10238 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10239 case PyUnicode_4BYTE_KIND:
10240 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10241 }
10242 assert(0);
10243 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010244}
10245
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010246static void
10247replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10248 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10249{
10250 int kind = PyUnicode_KIND(u);
10251 void *data = PyUnicode_DATA(u);
10252 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10253 if (kind == PyUnicode_1BYTE_KIND) {
10254 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10255 (Py_UCS1 *)data + len,
10256 u1, u2, maxcount);
10257 }
10258 else if (kind == PyUnicode_2BYTE_KIND) {
10259 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10260 (Py_UCS2 *)data + len,
10261 u1, u2, maxcount);
10262 }
10263 else {
10264 assert(kind == PyUnicode_4BYTE_KIND);
10265 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10266 (Py_UCS4 *)data + len,
10267 u1, u2, maxcount);
10268 }
10269}
10270
Alexander Belopolsky40018472011-02-26 01:02:56 +000010271static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272replace(PyObject *self, PyObject *str1,
10273 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010274{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 PyObject *u;
10276 char *sbuf = PyUnicode_DATA(self);
10277 char *buf1 = PyUnicode_DATA(str1);
10278 char *buf2 = PyUnicode_DATA(str2);
10279 int srelease = 0, release1 = 0, release2 = 0;
10280 int skind = PyUnicode_KIND(self);
10281 int kind1 = PyUnicode_KIND(str1);
10282 int kind2 = PyUnicode_KIND(str2);
10283 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10284 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10285 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010286 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010287 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010288
10289 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010290 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010292 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010293
Victor Stinner59de0ee2011-10-07 10:01:28 +020010294 if (str1 == str2)
10295 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296
Victor Stinner49a0a212011-10-12 23:46:10 +020010297 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010298 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10299 if (maxchar < maxchar_str1)
10300 /* substring too wide to be present */
10301 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010302 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10303 /* Replacing str1 with str2 may cause a maxchar reduction in the
10304 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010305 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010306 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010309 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010311 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010313 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010314 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010315 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010316
Victor Stinner69ed0f42013-04-09 21:48:24 +020010317 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010318 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010319 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010320 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010321 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010323 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010325
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010326 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10327 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010328 }
10329 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 int rkind = skind;
10331 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010332 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 if (kind1 < rkind) {
10335 /* widen substring */
10336 buf1 = _PyUnicode_AsKind(str1, rkind);
10337 if (!buf1) goto error;
10338 release1 = 1;
10339 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010340 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010341 if (i < 0)
10342 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 if (rkind > kind2) {
10344 /* widen replacement */
10345 buf2 = _PyUnicode_AsKind(str2, rkind);
10346 if (!buf2) goto error;
10347 release2 = 1;
10348 }
10349 else if (rkind < kind2) {
10350 /* widen self and buf1 */
10351 rkind = kind2;
10352 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010353 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 sbuf = _PyUnicode_AsKind(self, rkind);
10355 if (!sbuf) goto error;
10356 srelease = 1;
10357 buf1 = _PyUnicode_AsKind(str1, rkind);
10358 if (!buf1) goto error;
10359 release1 = 1;
10360 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010361 u = PyUnicode_New(slen, maxchar);
10362 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010364 assert(PyUnicode_KIND(u) == rkind);
10365 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010366
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010367 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010368 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010369 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010371 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010373
10374 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010375 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010376 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010377 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010378 if (i == -1)
10379 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010380 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010381 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010382 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010384 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010386 }
10387 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010389 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 int rkind = skind;
10391 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010392
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010393 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010394 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 buf1 = _PyUnicode_AsKind(str1, rkind);
10396 if (!buf1) goto error;
10397 release1 = 1;
10398 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010399 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010400 if (n == 0)
10401 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010403 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 buf2 = _PyUnicode_AsKind(str2, rkind);
10405 if (!buf2) goto error;
10406 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010409 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 rkind = kind2;
10411 sbuf = _PyUnicode_AsKind(self, rkind);
10412 if (!sbuf) goto error;
10413 srelease = 1;
10414 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010415 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 buf1 = _PyUnicode_AsKind(str1, rkind);
10417 if (!buf1) goto error;
10418 release1 = 1;
10419 }
10420 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10421 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010422 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 PyErr_SetString(PyExc_OverflowError,
10424 "replace string is too long");
10425 goto error;
10426 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010427 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010428 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010429 _Py_INCREF_UNICODE_EMPTY();
10430 if (!unicode_empty)
10431 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010432 u = unicode_empty;
10433 goto done;
10434 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010435 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 PyErr_SetString(PyExc_OverflowError,
10437 "replace string is too long");
10438 goto error;
10439 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010440 u = PyUnicode_New(new_size, maxchar);
10441 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010443 assert(PyUnicode_KIND(u) == rkind);
10444 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 ires = i = 0;
10446 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010447 while (n-- > 0) {
10448 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010449 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010450 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010451 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010452 if (j == -1)
10453 break;
10454 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010455 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010456 memcpy(res + rkind * ires,
10457 sbuf + rkind * i,
10458 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010460 }
10461 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010463 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010465 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010467 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010468 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010469 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010470 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010471 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010472 memcpy(res + rkind * ires,
10473 sbuf + rkind * i,
10474 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010475 }
10476 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010477 /* interleave */
10478 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010479 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010481 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010483 if (--n <= 0)
10484 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010485 memcpy(res + rkind * ires,
10486 sbuf + rkind * i,
10487 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010488 ires++;
10489 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010490 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010491 memcpy(res + rkind * ires,
10492 sbuf + rkind * i,
10493 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010494 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010495 }
10496
10497 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010498 unicode_adjust_maxchar(&u);
10499 if (u == NULL)
10500 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010502
10503 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504 if (srelease)
10505 PyMem_FREE(sbuf);
10506 if (release1)
10507 PyMem_FREE(buf1);
10508 if (release2)
10509 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010510 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010512
Benjamin Peterson29060642009-01-31 22:14:21 +000010513 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010514 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010515 if (srelease)
10516 PyMem_FREE(sbuf);
10517 if (release1)
10518 PyMem_FREE(buf1);
10519 if (release2)
10520 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010521 return unicode_result_unchanged(self);
10522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 error:
10524 if (srelease && sbuf)
10525 PyMem_FREE(sbuf);
10526 if (release1 && buf1)
10527 PyMem_FREE(buf1);
10528 if (release2 && buf2)
10529 PyMem_FREE(buf2);
10530 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531}
10532
10533/* --- Unicode Object Methods --------------------------------------------- */
10534
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010535PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010536 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537\n\
10538Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010539characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010540
10541static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010542unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010544 if (PyUnicode_READY(self) == -1)
10545 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010546 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547}
10548
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010549PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010550 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010551\n\
10552Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010553have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010554
10555static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010556unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010558 if (PyUnicode_READY(self) == -1)
10559 return NULL;
10560 if (PyUnicode_GET_LENGTH(self) == 0)
10561 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010562 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010563}
10564
Benjamin Petersond5890c82012-01-14 13:23:30 -050010565PyDoc_STRVAR(casefold__doc__,
10566 "S.casefold() -> str\n\
10567\n\
10568Return a version of S suitable for caseless comparisons.");
10569
10570static PyObject *
10571unicode_casefold(PyObject *self)
10572{
10573 if (PyUnicode_READY(self) == -1)
10574 return NULL;
10575 if (PyUnicode_IS_ASCII(self))
10576 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010577 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010578}
10579
10580
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010581/* Argument converter. Coerces to a single unicode character */
10582
10583static int
10584convert_uc(PyObject *obj, void *addr)
10585{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010587 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010588
Benjamin Peterson14339b62009-01-31 16:36:08 +000010589 uniobj = PyUnicode_FromObject(obj);
10590 if (uniobj == NULL) {
10591 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010592 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010593 return 0;
10594 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010596 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010597 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010598 Py_DECREF(uniobj);
10599 return 0;
10600 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010602 Py_DECREF(uniobj);
10603 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010604}
10605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010606PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010607 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010608\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010609Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010610done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
10612static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010613unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010614{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010615 Py_ssize_t marg, left;
10616 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 Py_UCS4 fillchar = ' ';
10618
Victor Stinnere9a29352011-10-01 02:14:59 +020010619 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010621
Benjamin Petersonbac79492012-01-14 13:34:47 -050010622 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010623 return NULL;
10624
Victor Stinnerc4b49542011-12-11 22:44:26 +010010625 if (PyUnicode_GET_LENGTH(self) >= width)
10626 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010627
Victor Stinnerc4b49542011-12-11 22:44:26 +010010628 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010629 left = marg / 2 + (marg & width & 1);
10630
Victor Stinner9310abb2011-10-05 00:59:23 +020010631 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632}
10633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634/* This function assumes that str1 and str2 are readied by the caller. */
10635
Marc-André Lemburge5034372000-08-08 08:04:29 +000010636static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010637unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010638{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010639#define COMPARE(TYPE1, TYPE2) \
10640 do { \
10641 TYPE1* p1 = (TYPE1 *)data1; \
10642 TYPE2* p2 = (TYPE2 *)data2; \
10643 TYPE1* end = p1 + len; \
10644 Py_UCS4 c1, c2; \
10645 for (; p1 != end; p1++, p2++) { \
10646 c1 = *p1; \
10647 c2 = *p2; \
10648 if (c1 != c2) \
10649 return (c1 < c2) ? -1 : 1; \
10650 } \
10651 } \
10652 while (0)
10653
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 int kind1, kind2;
10655 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010656 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010657
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 kind1 = PyUnicode_KIND(str1);
10659 kind2 = PyUnicode_KIND(str2);
10660 data1 = PyUnicode_DATA(str1);
10661 data2 = PyUnicode_DATA(str2);
10662 len1 = PyUnicode_GET_LENGTH(str1);
10663 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010664 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010665
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010666 switch(kind1) {
10667 case PyUnicode_1BYTE_KIND:
10668 {
10669 switch(kind2) {
10670 case PyUnicode_1BYTE_KIND:
10671 {
10672 int cmp = memcmp(data1, data2, len);
10673 /* normalize result of memcmp() into the range [-1; 1] */
10674 if (cmp < 0)
10675 return -1;
10676 if (cmp > 0)
10677 return 1;
10678 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010679 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010680 case PyUnicode_2BYTE_KIND:
10681 COMPARE(Py_UCS1, Py_UCS2);
10682 break;
10683 case PyUnicode_4BYTE_KIND:
10684 COMPARE(Py_UCS1, Py_UCS4);
10685 break;
10686 default:
10687 assert(0);
10688 }
10689 break;
10690 }
10691 case PyUnicode_2BYTE_KIND:
10692 {
10693 switch(kind2) {
10694 case PyUnicode_1BYTE_KIND:
10695 COMPARE(Py_UCS2, Py_UCS1);
10696 break;
10697 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010698 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010699 COMPARE(Py_UCS2, Py_UCS2);
10700 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010701 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010702 case PyUnicode_4BYTE_KIND:
10703 COMPARE(Py_UCS2, Py_UCS4);
10704 break;
10705 default:
10706 assert(0);
10707 }
10708 break;
10709 }
10710 case PyUnicode_4BYTE_KIND:
10711 {
10712 switch(kind2) {
10713 case PyUnicode_1BYTE_KIND:
10714 COMPARE(Py_UCS4, Py_UCS1);
10715 break;
10716 case PyUnicode_2BYTE_KIND:
10717 COMPARE(Py_UCS4, Py_UCS2);
10718 break;
10719 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010720 {
10721#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10722 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10723 /* normalize result of wmemcmp() into the range [-1; 1] */
10724 if (cmp < 0)
10725 return -1;
10726 if (cmp > 0)
10727 return 1;
10728#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010729 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010730#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010731 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010732 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010733 default:
10734 assert(0);
10735 }
10736 break;
10737 }
10738 default:
10739 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010740 }
10741
Victor Stinner770e19e2012-10-04 22:59:45 +020010742 if (len1 == len2)
10743 return 0;
10744 if (len1 < len2)
10745 return -1;
10746 else
10747 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010748
10749#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010750}
10751
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010752Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010753unicode_compare_eq(PyObject *str1, PyObject *str2)
10754{
10755 int kind;
10756 void *data1, *data2;
10757 Py_ssize_t len;
10758 int cmp;
10759
Victor Stinnere5567ad2012-10-23 02:48:49 +020010760 len = PyUnicode_GET_LENGTH(str1);
10761 if (PyUnicode_GET_LENGTH(str2) != len)
10762 return 0;
10763 kind = PyUnicode_KIND(str1);
10764 if (PyUnicode_KIND(str2) != kind)
10765 return 0;
10766 data1 = PyUnicode_DATA(str1);
10767 data2 = PyUnicode_DATA(str2);
10768
10769 cmp = memcmp(data1, data2, len * kind);
10770 return (cmp == 0);
10771}
10772
10773
Alexander Belopolsky40018472011-02-26 01:02:56 +000010774int
10775PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010776{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10778 if (PyUnicode_READY(left) == -1 ||
10779 PyUnicode_READY(right) == -1)
10780 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010781
10782 /* a string is equal to itself */
10783 if (left == right)
10784 return 0;
10785
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010786 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010788 PyErr_Format(PyExc_TypeError,
10789 "Can't compare %.100s and %.100s",
10790 left->ob_type->tp_name,
10791 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010792 return -1;
10793}
10794
Martin v. Löwis5b222132007-06-10 09:51:05 +000010795int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010796_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10797{
10798 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10799 if (right_str == NULL)
10800 return -1;
10801 return PyUnicode_Compare(left, right_str);
10802}
10803
10804int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010805PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10806{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010807 Py_ssize_t i;
10808 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 Py_UCS4 chr;
10810
Victor Stinner910337b2011-10-03 03:20:16 +020010811 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 if (PyUnicode_READY(uni) == -1)
10813 return -1;
10814 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010815 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010816 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010817 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010818 size_t len, len2 = strlen(str);
10819 int cmp;
10820
10821 len = Py_MIN(len1, len2);
10822 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010823 if (cmp != 0) {
10824 if (cmp < 0)
10825 return -1;
10826 else
10827 return 1;
10828 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010829 if (len1 > len2)
10830 return 1; /* uni is longer */
10831 if (len2 > len1)
10832 return -1; /* str is longer */
10833 return 0;
10834 }
10835 else {
10836 void *data = PyUnicode_DATA(uni);
10837 /* Compare Unicode string and source character set string */
10838 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010839 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010840 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10841 /* This check keeps Python strings that end in '\0' from comparing equal
10842 to C strings identical up to that point. */
10843 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10844 return 1; /* uni is longer */
10845 if (str[i])
10846 return -1; /* str is longer */
10847 return 0;
10848 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010849}
10850
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010851
Benjamin Peterson29060642009-01-31 22:14:21 +000010852#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010853 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010854
Alexander Belopolsky40018472011-02-26 01:02:56 +000010855PyObject *
10856PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010857{
10858 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010859 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010860
Victor Stinnere5567ad2012-10-23 02:48:49 +020010861 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10862 Py_RETURN_NOTIMPLEMENTED;
10863
10864 if (PyUnicode_READY(left) == -1 ||
10865 PyUnicode_READY(right) == -1)
10866 return NULL;
10867
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010868 if (left == right) {
10869 switch (op) {
10870 case Py_EQ:
10871 case Py_LE:
10872 case Py_GE:
10873 /* a string is equal to itself */
10874 v = Py_True;
10875 break;
10876 case Py_NE:
10877 case Py_LT:
10878 case Py_GT:
10879 v = Py_False;
10880 break;
10881 default:
10882 PyErr_BadArgument();
10883 return NULL;
10884 }
10885 }
10886 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010887 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010888 result ^= (op == Py_NE);
10889 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010890 }
10891 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010892 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010893
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010894 /* Convert the return value to a Boolean */
10895 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010896 case Py_LE:
10897 v = TEST_COND(result <= 0);
10898 break;
10899 case Py_GE:
10900 v = TEST_COND(result >= 0);
10901 break;
10902 case Py_LT:
10903 v = TEST_COND(result == -1);
10904 break;
10905 case Py_GT:
10906 v = TEST_COND(result == 1);
10907 break;
10908 default:
10909 PyErr_BadArgument();
10910 return NULL;
10911 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010912 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010913 Py_INCREF(v);
10914 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010915}
10916
Alexander Belopolsky40018472011-02-26 01:02:56 +000010917int
10918PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010919{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010920 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010921 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922 void *buf1, *buf2;
10923 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010924 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010925
10926 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010927 sub = PyUnicode_FromObject(element);
10928 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010929 PyErr_Format(PyExc_TypeError,
10930 "'in <string>' requires string as left operand, not %s",
10931 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010932 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010933 }
10934
Thomas Wouters477c8d52006-05-27 19:21:47 +000010935 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010936 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010937 Py_DECREF(sub);
10938 return -1;
10939 }
10940
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 kind1 = PyUnicode_KIND(str);
10942 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943 buf1 = PyUnicode_DATA(str);
10944 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010945 if (kind2 != kind1) {
10946 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010947 Py_DECREF(sub);
10948 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010949 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010950 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010951 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010952 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 if (!buf2) {
10954 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010955 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 return -1;
10957 }
10958 len1 = PyUnicode_GET_LENGTH(str);
10959 len2 = PyUnicode_GET_LENGTH(sub);
10960
Victor Stinner77282cb2013-04-14 19:22:47 +020010961 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 case PyUnicode_1BYTE_KIND:
10963 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10964 break;
10965 case PyUnicode_2BYTE_KIND:
10966 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10967 break;
10968 case PyUnicode_4BYTE_KIND:
10969 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10970 break;
10971 default:
10972 result = -1;
10973 assert(0);
10974 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010975
10976 Py_DECREF(str);
10977 Py_DECREF(sub);
10978
Victor Stinner77282cb2013-04-14 19:22:47 +020010979 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 PyMem_Free(buf2);
10981
Guido van Rossum403d68b2000-03-13 15:55:09 +000010982 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010983}
10984
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985/* Concat to string or Unicode object giving a new Unicode object. */
10986
Alexander Belopolsky40018472011-02-26 01:02:56 +000010987PyObject *
10988PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010990 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010991 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010992 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010993
10994 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010997 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011000 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011001
11002 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011003 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011004 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011006 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011007 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011008 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011009 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011010 }
11011
Victor Stinner488fa492011-12-12 00:01:39 +010011012 u_len = PyUnicode_GET_LENGTH(u);
11013 v_len = PyUnicode_GET_LENGTH(v);
11014 if (u_len > PY_SSIZE_T_MAX - v_len) {
11015 PyErr_SetString(PyExc_OverflowError,
11016 "strings are too large to concat");
11017 goto onError;
11018 }
11019 new_len = u_len + v_len;
11020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011022 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011023 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011026 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011028 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011029 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11030 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011031 Py_DECREF(u);
11032 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011033 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035
Benjamin Peterson29060642009-01-31 22:14:21 +000011036 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011037 Py_XDECREF(u);
11038 Py_XDECREF(v);
11039 return NULL;
11040}
11041
Walter Dörwald1ab83302007-05-18 17:15:44 +000011042void
Victor Stinner23e56682011-10-03 03:54:37 +020011043PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011044{
Victor Stinner23e56682011-10-03 03:54:37 +020011045 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011046 Py_UCS4 maxchar, maxchar2;
11047 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011048
11049 if (p_left == NULL) {
11050 if (!PyErr_Occurred())
11051 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011052 return;
11053 }
Victor Stinner23e56682011-10-03 03:54:37 +020011054 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011055 if (right == NULL || left == NULL
11056 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011057 if (!PyErr_Occurred())
11058 PyErr_BadInternalCall();
11059 goto error;
11060 }
11061
Benjamin Petersonbac79492012-01-14 13:34:47 -050011062 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011063 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011064 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011065 goto error;
11066
Victor Stinner488fa492011-12-12 00:01:39 +010011067 /* Shortcuts */
11068 if (left == unicode_empty) {
11069 Py_DECREF(left);
11070 Py_INCREF(right);
11071 *p_left = right;
11072 return;
11073 }
11074 if (right == unicode_empty)
11075 return;
11076
11077 left_len = PyUnicode_GET_LENGTH(left);
11078 right_len = PyUnicode_GET_LENGTH(right);
11079 if (left_len > PY_SSIZE_T_MAX - right_len) {
11080 PyErr_SetString(PyExc_OverflowError,
11081 "strings are too large to concat");
11082 goto error;
11083 }
11084 new_len = left_len + right_len;
11085
11086 if (unicode_modifiable(left)
11087 && PyUnicode_CheckExact(right)
11088 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011089 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11090 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011091 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011092 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011093 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11094 {
11095 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011096 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011097 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011098
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011099 /* copy 'right' into the newly allocated area of 'left' */
11100 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011101 }
Victor Stinner488fa492011-12-12 00:01:39 +010011102 else {
11103 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11104 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011105 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011106
Victor Stinner488fa492011-12-12 00:01:39 +010011107 /* Concat the two Unicode strings */
11108 res = PyUnicode_New(new_len, maxchar);
11109 if (res == NULL)
11110 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011111 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11112 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011113 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011114 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011115 }
11116 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011117 return;
11118
11119error:
Victor Stinner488fa492011-12-12 00:01:39 +010011120 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011121}
11122
11123void
11124PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11125{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011126 PyUnicode_Append(pleft, right);
11127 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011128}
11129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011130PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011131 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011133Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011134string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011135interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136
11137static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011138unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011140 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011141 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011142 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011143 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 int kind1, kind2, kind;
11145 void *buf1, *buf2;
11146 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011147
Jesus Ceaac451502011-04-20 17:09:23 +020011148 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11149 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011150 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011151
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011152 kind1 = PyUnicode_KIND(self);
11153 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020011154 if (kind2 > kind1) {
11155 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011156 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011157 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011158 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011159 buf1 = PyUnicode_DATA(self);
11160 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011161 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011162 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011163 if (!buf2) {
11164 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 return NULL;
11166 }
11167 len1 = PyUnicode_GET_LENGTH(self);
11168 len2 = PyUnicode_GET_LENGTH(substring);
11169
11170 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011171 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 case PyUnicode_1BYTE_KIND:
11173 iresult = ucs1lib_count(
11174 ((Py_UCS1*)buf1) + start, end - start,
11175 buf2, len2, PY_SSIZE_T_MAX
11176 );
11177 break;
11178 case PyUnicode_2BYTE_KIND:
11179 iresult = ucs2lib_count(
11180 ((Py_UCS2*)buf1) + start, end - start,
11181 buf2, len2, PY_SSIZE_T_MAX
11182 );
11183 break;
11184 case PyUnicode_4BYTE_KIND:
11185 iresult = ucs4lib_count(
11186 ((Py_UCS4*)buf1) + start, end - start,
11187 buf2, len2, PY_SSIZE_T_MAX
11188 );
11189 break;
11190 default:
11191 assert(0); iresult = 0;
11192 }
11193
11194 result = PyLong_FromSsize_t(iresult);
11195
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011196 if (kind2 != kind)
11197 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011198
11199 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011200
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201 return result;
11202}
11203
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011204PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011205 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011207Encode S using the codec registered for encoding. Default encoding\n\
11208is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011209handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011210a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11211'xmlcharrefreplace' as well as any other name registered with\n\
11212codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213
11214static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011215unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011217 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218 char *encoding = NULL;
11219 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011220
Benjamin Peterson308d6372009-09-18 21:42:35 +000011221 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11222 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011224 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011225}
11226
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011227PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011228 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011229\n\
11230Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011231If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011232
11233static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011234unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011235{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011236 Py_ssize_t i, j, line_pos, src_len, incr;
11237 Py_UCS4 ch;
11238 PyObject *u;
11239 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011240 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011242 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011243 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011244
Ezio Melotti745d54d2013-11-16 19:10:57 +020011245 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11246 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011247 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011248
Antoine Pitrou22425222011-10-04 19:10:51 +020011249 if (PyUnicode_READY(self) == -1)
11250 return NULL;
11251
Thomas Wouters7e474022000-07-16 12:04:32 +000011252 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011253 src_len = PyUnicode_GET_LENGTH(self);
11254 i = j = line_pos = 0;
11255 kind = PyUnicode_KIND(self);
11256 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011257 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011258 for (; i < src_len; i++) {
11259 ch = PyUnicode_READ(kind, src_data, i);
11260 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011261 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011262 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011263 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011265 goto overflow;
11266 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011267 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011268 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011269 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011271 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011272 goto overflow;
11273 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011274 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011275 if (ch == '\n' || ch == '\r')
11276 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011278 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011279 if (!found)
11280 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011281
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011283 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011284 if (!u)
11285 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011286 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287
Antoine Pitroue71d5742011-10-04 15:55:09 +020011288 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289
Antoine Pitroue71d5742011-10-04 15:55:09 +020011290 for (; i < src_len; i++) {
11291 ch = PyUnicode_READ(kind, src_data, i);
11292 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011293 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011294 incr = tabsize - (line_pos % tabsize);
11295 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011296 FILL(kind, dest_data, ' ', j, incr);
11297 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011298 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011299 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011300 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011301 line_pos++;
11302 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011303 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011304 if (ch == '\n' || ch == '\r')
11305 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011306 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011307 }
11308 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011309 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011310
Antoine Pitroue71d5742011-10-04 15:55:09 +020011311 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011312 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11313 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314}
11315
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011316PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011317 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011318\n\
11319Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011320such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011321arguments start and end are interpreted as in slice notation.\n\
11322\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011323Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011324
11325static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011326unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011327{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011328 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011329 Py_ssize_t start;
11330 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011331 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011332
Jesus Ceaac451502011-04-20 17:09:23 +020011333 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11334 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011336
Christian Heimesd47802e2013-06-29 21:33:36 +020011337 if (PyUnicode_READY(self) == -1) {
11338 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011340 }
11341 if (PyUnicode_READY(substring) == -1) {
11342 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011343 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011344 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345
Victor Stinner7931d9a2011-11-04 00:22:48 +010011346 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011347
11348 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011350 if (result == -2)
11351 return NULL;
11352
Christian Heimes217cfd12007-12-02 14:31:20 +000011353 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354}
11355
11356static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011357unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011359 void *data;
11360 enum PyUnicode_Kind kind;
11361 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011362
11363 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11364 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011365 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011366 }
11367 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11368 PyErr_SetString(PyExc_IndexError, "string index out of range");
11369 return NULL;
11370 }
11371 kind = PyUnicode_KIND(self);
11372 data = PyUnicode_DATA(self);
11373 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011374 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375}
11376
Guido van Rossumc2504932007-09-18 19:42:40 +000011377/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011378 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011379static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011380unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011381{
Guido van Rossumc2504932007-09-18 19:42:40 +000011382 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011383 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011384
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011385#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011386 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011387#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 if (_PyUnicode_HASH(self) != -1)
11389 return _PyUnicode_HASH(self);
11390 if (PyUnicode_READY(self) == -1)
11391 return -1;
11392 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011393 /*
11394 We make the hash of the empty string be 0, rather than using
11395 (prefix ^ suffix), since this slightly obfuscates the hash secret
11396 */
11397 if (len == 0) {
11398 _PyUnicode_HASH(self) = 0;
11399 return 0;
11400 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011401 x = _Py_HashBytes(PyUnicode_DATA(self),
11402 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011404 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405}
11406
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011407PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011408 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011409\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011410Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011411
11412static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011413unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011415 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011416 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011417 Py_ssize_t start;
11418 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419
Jesus Ceaac451502011-04-20 17:09:23 +020011420 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11421 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011423
Christian Heimesd47a0452013-06-29 21:21:37 +020011424 if (PyUnicode_READY(self) == -1) {
11425 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011426 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011427 }
11428 if (PyUnicode_READY(substring) == -1) {
11429 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011430 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011431 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432
Victor Stinner7931d9a2011-11-04 00:22:48 +010011433 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434
11435 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011437 if (result == -2)
11438 return NULL;
11439
Guido van Rossumd57fd912000-03-10 22:53:23 +000011440 if (result < 0) {
11441 PyErr_SetString(PyExc_ValueError, "substring not found");
11442 return NULL;
11443 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011444
Christian Heimes217cfd12007-12-02 14:31:20 +000011445 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446}
11447
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011448PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011449 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011450\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011451Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011452at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011453
11454static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011455unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011456{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 Py_ssize_t i, length;
11458 int kind;
11459 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011460 int cased;
11461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 if (PyUnicode_READY(self) == -1)
11463 return NULL;
11464 length = PyUnicode_GET_LENGTH(self);
11465 kind = PyUnicode_KIND(self);
11466 data = PyUnicode_DATA(self);
11467
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 if (length == 1)
11470 return PyBool_FromLong(
11471 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011473 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011474 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011475 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011476
Guido van Rossumd57fd912000-03-10 22:53:23 +000011477 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 for (i = 0; i < length; i++) {
11479 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011480
Benjamin Peterson29060642009-01-31 22:14:21 +000011481 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11482 return PyBool_FromLong(0);
11483 else if (!cased && Py_UNICODE_ISLOWER(ch))
11484 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011486 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011487}
11488
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011489PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011490 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011492Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011493at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011494
11495static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011496unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011497{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498 Py_ssize_t i, length;
11499 int kind;
11500 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011501 int cased;
11502
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011503 if (PyUnicode_READY(self) == -1)
11504 return NULL;
11505 length = PyUnicode_GET_LENGTH(self);
11506 kind = PyUnicode_KIND(self);
11507 data = PyUnicode_DATA(self);
11508
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011510 if (length == 1)
11511 return PyBool_FromLong(
11512 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011514 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011515 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011516 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011517
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011519 for (i = 0; i < length; i++) {
11520 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011521
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11523 return PyBool_FromLong(0);
11524 else if (!cased && Py_UNICODE_ISUPPER(ch))
11525 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011527 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528}
11529
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011530PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011531 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011533Return True if S is a titlecased string and there is at least one\n\
11534character in S, i.e. upper- and titlecase characters may only\n\
11535follow uncased characters and lowercase characters only cased ones.\n\
11536Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537
11538static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011539unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011540{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541 Py_ssize_t i, length;
11542 int kind;
11543 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544 int cased, previous_is_cased;
11545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011546 if (PyUnicode_READY(self) == -1)
11547 return NULL;
11548 length = PyUnicode_GET_LENGTH(self);
11549 kind = PyUnicode_KIND(self);
11550 data = PyUnicode_DATA(self);
11551
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011553 if (length == 1) {
11554 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11555 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11556 (Py_UNICODE_ISUPPER(ch) != 0));
11557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011559 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011562
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563 cased = 0;
11564 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011565 for (i = 0; i < length; i++) {
11566 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011567
Benjamin Peterson29060642009-01-31 22:14:21 +000011568 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11569 if (previous_is_cased)
11570 return PyBool_FromLong(0);
11571 previous_is_cased = 1;
11572 cased = 1;
11573 }
11574 else if (Py_UNICODE_ISLOWER(ch)) {
11575 if (!previous_is_cased)
11576 return PyBool_FromLong(0);
11577 previous_is_cased = 1;
11578 cased = 1;
11579 }
11580 else
11581 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011583 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011584}
11585
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011586PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011587 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011589Return True if all characters in S are whitespace\n\
11590and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591
11592static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011593unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011595 Py_ssize_t i, length;
11596 int kind;
11597 void *data;
11598
11599 if (PyUnicode_READY(self) == -1)
11600 return NULL;
11601 length = PyUnicode_GET_LENGTH(self);
11602 kind = PyUnicode_KIND(self);
11603 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011606 if (length == 1)
11607 return PyBool_FromLong(
11608 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011610 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011611 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011612 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011613
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614 for (i = 0; i < length; i++) {
11615 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011616 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011617 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011619 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620}
11621
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011622PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011623 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011624\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011625Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011626and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011627
11628static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011629unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011630{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 Py_ssize_t i, length;
11632 int kind;
11633 void *data;
11634
11635 if (PyUnicode_READY(self) == -1)
11636 return NULL;
11637 length = PyUnicode_GET_LENGTH(self);
11638 kind = PyUnicode_KIND(self);
11639 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011640
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011641 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642 if (length == 1)
11643 return PyBool_FromLong(
11644 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011645
11646 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011648 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011649
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 for (i = 0; i < length; i++) {
11651 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011652 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011653 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011654 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011655}
11656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011657PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011658 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011659\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011660Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011661and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011662
11663static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011664unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011665{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011666 int kind;
11667 void *data;
11668 Py_ssize_t len, i;
11669
11670 if (PyUnicode_READY(self) == -1)
11671 return NULL;
11672
11673 kind = PyUnicode_KIND(self);
11674 data = PyUnicode_DATA(self);
11675 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011676
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011677 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 if (len == 1) {
11679 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11680 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11681 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011682
11683 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011684 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011685 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011686
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011687 for (i = 0; i < len; i++) {
11688 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011689 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011690 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011691 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011692 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011693}
11694
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011695PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011696 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011698Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011699False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700
11701static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011702unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704 Py_ssize_t i, length;
11705 int kind;
11706 void *data;
11707
11708 if (PyUnicode_READY(self) == -1)
11709 return NULL;
11710 length = PyUnicode_GET_LENGTH(self);
11711 kind = PyUnicode_KIND(self);
11712 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 if (length == 1)
11716 return PyBool_FromLong(
11717 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011719 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011720 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011721 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011723 for (i = 0; i < length; i++) {
11724 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011725 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011727 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728}
11729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011730PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011731 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011733Return True if all characters in S are digits\n\
11734and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735
11736static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011737unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011739 Py_ssize_t i, length;
11740 int kind;
11741 void *data;
11742
11743 if (PyUnicode_READY(self) == -1)
11744 return NULL;
11745 length = PyUnicode_GET_LENGTH(self);
11746 kind = PyUnicode_KIND(self);
11747 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750 if (length == 1) {
11751 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11752 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11753 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011755 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011757 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011759 for (i = 0; i < length; i++) {
11760 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011761 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011763 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764}
11765
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011766PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011767 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011769Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011770False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011771
11772static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011773unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 Py_ssize_t i, length;
11776 int kind;
11777 void *data;
11778
11779 if (PyUnicode_READY(self) == -1)
11780 return NULL;
11781 length = PyUnicode_GET_LENGTH(self);
11782 kind = PyUnicode_KIND(self);
11783 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 if (length == 1)
11787 return PyBool_FromLong(
11788 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011790 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 for (i = 0; i < length; i++) {
11795 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011796 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011798 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799}
11800
Martin v. Löwis47383402007-08-15 07:32:56 +000011801int
11802PyUnicode_IsIdentifier(PyObject *self)
11803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 int kind;
11805 void *data;
11806 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011807 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011808
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 if (PyUnicode_READY(self) == -1) {
11810 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011811 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 }
11813
11814 /* Special case for empty strings */
11815 if (PyUnicode_GET_LENGTH(self) == 0)
11816 return 0;
11817 kind = PyUnicode_KIND(self);
11818 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011819
11820 /* PEP 3131 says that the first character must be in
11821 XID_Start and subsequent characters in XID_Continue,
11822 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011823 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011824 letters, digits, underscore). However, given the current
11825 definition of XID_Start and XID_Continue, it is sufficient
11826 to check just for these, except that _ must be allowed
11827 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011829 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011830 return 0;
11831
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011832 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011834 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011835 return 1;
11836}
11837
11838PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011839 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011840\n\
11841Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011842to the language definition.\n\
11843\n\
11844Use keyword.iskeyword() to test for reserved identifiers\n\
11845such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011846
11847static PyObject*
11848unicode_isidentifier(PyObject *self)
11849{
11850 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11851}
11852
Georg Brandl559e5d72008-06-11 18:37:52 +000011853PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011854 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011855\n\
11856Return True if all characters in S are considered\n\
11857printable in repr() or S is empty, False otherwise.");
11858
11859static PyObject*
11860unicode_isprintable(PyObject *self)
11861{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 Py_ssize_t i, length;
11863 int kind;
11864 void *data;
11865
11866 if (PyUnicode_READY(self) == -1)
11867 return NULL;
11868 length = PyUnicode_GET_LENGTH(self);
11869 kind = PyUnicode_KIND(self);
11870 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011871
11872 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011873 if (length == 1)
11874 return PyBool_FromLong(
11875 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011877 for (i = 0; i < length; i++) {
11878 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011879 Py_RETURN_FALSE;
11880 }
11881 }
11882 Py_RETURN_TRUE;
11883}
11884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011885PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011886 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887\n\
11888Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011889iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890
11891static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011892unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011894 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895}
11896
Martin v. Löwis18e16552006-02-15 17:27:45 +000011897static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011898unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 if (PyUnicode_READY(self) == -1)
11901 return -1;
11902 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011903}
11904
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011905PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011908Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011909done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910
11911static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011912unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011914 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915 Py_UCS4 fillchar = ' ';
11916
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011917 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918 return NULL;
11919
Benjamin Petersonbac79492012-01-14 13:34:47 -050011920 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011921 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922
Victor Stinnerc4b49542011-12-11 22:44:26 +010011923 if (PyUnicode_GET_LENGTH(self) >= width)
11924 return unicode_result_unchanged(self);
11925
11926 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927}
11928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011929PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011932Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933
11934static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011935unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011937 if (PyUnicode_READY(self) == -1)
11938 return NULL;
11939 if (PyUnicode_IS_ASCII(self))
11940 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011941 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011942}
11943
/* Strip-mode selectors; each value doubles as an index into
   stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip mode above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Bare method name for strip mode i: the format string without its
   three-character "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
11952
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011953/* externally visible for str.strip(unicode) */
11954PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011955_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011956{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 void *data;
11958 int kind;
11959 Py_ssize_t i, j, len;
11960 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011961 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011963 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11964 return NULL;
11965
11966 kind = PyUnicode_KIND(self);
11967 data = PyUnicode_DATA(self);
11968 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011969 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011970 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11971 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011972 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011973
Benjamin Peterson14339b62009-01-31 16:36:08 +000011974 i = 0;
11975 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011976 while (i < len) {
11977 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11978 if (!BLOOM(sepmask, ch))
11979 break;
11980 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11981 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011982 i++;
11983 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011984 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011985
Benjamin Peterson14339b62009-01-31 16:36:08 +000011986 j = len;
11987 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011988 j--;
11989 while (j >= i) {
11990 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11991 if (!BLOOM(sepmask, ch))
11992 break;
11993 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11994 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011995 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011996 }
11997
Benjamin Peterson29060642009-01-31 22:14:21 +000011998 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011999 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012000
Victor Stinner7931d9a2011-11-04 00:22:48 +010012001 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012002}
12003
12004PyObject*
12005PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12006{
12007 unsigned char *data;
12008 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012009 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010
Victor Stinnerde636f32011-10-01 03:55:54 +020012011 if (PyUnicode_READY(self) == -1)
12012 return NULL;
12013
Victor Stinner684d5fd2012-05-03 02:32:34 +020012014 length = PyUnicode_GET_LENGTH(self);
12015 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012016
Victor Stinner684d5fd2012-05-03 02:32:34 +020012017 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012018 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019
Victor Stinnerde636f32011-10-01 03:55:54 +020012020 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012021 PyErr_SetString(PyExc_IndexError, "string index out of range");
12022 return NULL;
12023 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012024 if (start >= length || end < start)
12025 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012026
Victor Stinner684d5fd2012-05-03 02:32:34 +020012027 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012028 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012029 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012030 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012031 }
12032 else {
12033 kind = PyUnicode_KIND(self);
12034 data = PyUnicode_1BYTE_DATA(self);
12035 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012036 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012037 length);
12038 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012040
12041static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012042do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 Py_ssize_t len, i, j;
12045
12046 if (PyUnicode_READY(self) == -1)
12047 return NULL;
12048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012050
Victor Stinnercc7af722013-04-09 22:39:24 +020012051 if (PyUnicode_IS_ASCII(self)) {
12052 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12053
12054 i = 0;
12055 if (striptype != RIGHTSTRIP) {
12056 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012057 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012058 if (!_Py_ascii_whitespace[ch])
12059 break;
12060 i++;
12061 }
12062 }
12063
12064 j = len;
12065 if (striptype != LEFTSTRIP) {
12066 j--;
12067 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012068 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012069 if (!_Py_ascii_whitespace[ch])
12070 break;
12071 j--;
12072 }
12073 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012074 }
12075 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012076 else {
12077 int kind = PyUnicode_KIND(self);
12078 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012079
Victor Stinnercc7af722013-04-09 22:39:24 +020012080 i = 0;
12081 if (striptype != RIGHTSTRIP) {
12082 while (i < len) {
12083 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12084 if (!Py_UNICODE_ISSPACE(ch))
12085 break;
12086 i++;
12087 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012088 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012089
12090 j = len;
12091 if (striptype != LEFTSTRIP) {
12092 j--;
12093 while (j >= i) {
12094 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12095 if (!Py_UNICODE_ISSPACE(ch))
12096 break;
12097 j--;
12098 }
12099 j++;
12100 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012101 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012102
Victor Stinner7931d9a2011-11-04 00:22:48 +010012103 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012104}
12105
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012106
12107static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012108do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012109{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012110 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012111
Serhiy Storchakac6792272013-10-19 21:03:34 +030012112 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012113 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012114
Benjamin Peterson14339b62009-01-31 16:36:08 +000012115 if (sep != NULL && sep != Py_None) {
12116 if (PyUnicode_Check(sep))
12117 return _PyUnicode_XStrip(self, striptype, sep);
12118 else {
12119 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012120 "%s arg must be None or str",
12121 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012122 return NULL;
12123 }
12124 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012125
Benjamin Peterson14339b62009-01-31 16:36:08 +000012126 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012127}
12128
12129
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012130PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012131 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012132\n\
12133Return a copy of the string S with leading and trailing\n\
12134whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012135If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012136
12137static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012138unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012139{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012140 if (PyTuple_GET_SIZE(args) == 0)
12141 return do_strip(self, BOTHSTRIP); /* Common case */
12142 else
12143 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012144}
12145
12146
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012147PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012148 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012149\n\
12150Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012151If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012152
12153static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012154unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012155{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012156 if (PyTuple_GET_SIZE(args) == 0)
12157 return do_strip(self, LEFTSTRIP); /* Common case */
12158 else
12159 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012160}
12161
12162
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012163PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012164 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012165\n\
12166Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012167If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012168
12169static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012170unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012171{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012172 if (PyTuple_GET_SIZE(args) == 0)
12173 return do_strip(self, RIGHTSTRIP); /* Common case */
12174 else
12175 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012176}
12177
12178
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012180unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012181{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012182 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012183 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184
Serhiy Storchaka05997252013-01-26 12:14:02 +020012185 if (len < 1)
12186 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012187
Victor Stinnerc4b49542011-12-11 22:44:26 +010012188 /* no repeat, return original string */
12189 if (len == 1)
12190 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012191
Benjamin Petersonbac79492012-01-14 13:34:47 -050012192 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012193 return NULL;
12194
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012195 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012196 PyErr_SetString(PyExc_OverflowError,
12197 "repeated string is too long");
12198 return NULL;
12199 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012201
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012202 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203 if (!u)
12204 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012205 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 if (PyUnicode_GET_LENGTH(str) == 1) {
12208 const int kind = PyUnicode_KIND(str);
12209 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012210 if (kind == PyUnicode_1BYTE_KIND) {
12211 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012212 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012213 }
12214 else if (kind == PyUnicode_2BYTE_KIND) {
12215 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012216 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012217 ucs2[n] = fill_char;
12218 } else {
12219 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12220 assert(kind == PyUnicode_4BYTE_KIND);
12221 for (n = 0; n < len; ++n)
12222 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 }
12225 else {
12226 /* number of characters copied this far */
12227 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012228 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 char *to = (char *) PyUnicode_DATA(u);
12230 Py_MEMCPY(to, PyUnicode_DATA(str),
12231 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012232 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 n = (done <= nchars-done) ? done : nchars-done;
12234 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012235 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237 }
12238
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012239 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012240 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241}
12242
Alexander Belopolsky40018472011-02-26 01:02:56 +000012243PyObject *
12244PyUnicode_Replace(PyObject *obj,
12245 PyObject *subobj,
12246 PyObject *replobj,
12247 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248{
12249 PyObject *self;
12250 PyObject *str1;
12251 PyObject *str2;
12252 PyObject *result;
12253
12254 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012255 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012256 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012258 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012259 Py_DECREF(self);
12260 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261 }
12262 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012263 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012264 Py_DECREF(self);
12265 Py_DECREF(str1);
12266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012268 if (PyUnicode_READY(self) == -1 ||
12269 PyUnicode_READY(str1) == -1 ||
12270 PyUnicode_READY(str2) == -1)
12271 result = NULL;
12272 else
12273 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274 Py_DECREF(self);
12275 Py_DECREF(str1);
12276 Py_DECREF(str2);
12277 return result;
12278}
12279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012280PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012281 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282\n\
12283Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012284old replaced by new. If the optional argument count is\n\
12285given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286
12287static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012289{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 PyObject *str1;
12291 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012292 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012293 PyObject *result;
12294
Martin v. Löwis18e16552006-02-15 17:27:45 +000012295 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012296 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012297 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012298 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012300 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 return NULL;
12302 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012303 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012304 Py_DECREF(str1);
12305 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012306 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012307 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12308 result = NULL;
12309 else
12310 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012311
12312 Py_DECREF(str1);
12313 Py_DECREF(str2);
12314 return result;
12315}
12316
Alexander Belopolsky40018472011-02-26 01:02:56 +000012317static PyObject *
12318unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012319{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012320 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012321 Py_ssize_t isize;
12322 Py_ssize_t osize, squote, dquote, i, o;
12323 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012324 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012326
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012328 return NULL;
12329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 isize = PyUnicode_GET_LENGTH(unicode);
12331 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 /* Compute length of output, quote characters, and
12334 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012335 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 max = 127;
12337 squote = dquote = 0;
12338 ikind = PyUnicode_KIND(unicode);
12339 for (i = 0; i < isize; i++) {
12340 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012341 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012343 case '\'': squote++; break;
12344 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012346 incr = 2;
12347 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 default:
12349 /* Fast-path ASCII */
12350 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012351 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012353 ;
12354 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012357 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012359 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012361 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012362 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012363 if (osize > PY_SSIZE_T_MAX - incr) {
12364 PyErr_SetString(PyExc_OverflowError,
12365 "string is too long to generate repr");
12366 return NULL;
12367 }
12368 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012369 }
12370
12371 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012372 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012374 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375 if (dquote)
12376 /* Both squote and dquote present. Use squote,
12377 and escape them */
12378 osize += squote;
12379 else
12380 quote = '"';
12381 }
Victor Stinner55c08782013-04-14 18:45:39 +020012382 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012383
12384 repr = PyUnicode_New(osize, max);
12385 if (repr == NULL)
12386 return NULL;
12387 okind = PyUnicode_KIND(repr);
12388 odata = PyUnicode_DATA(repr);
12389
12390 PyUnicode_WRITE(okind, odata, 0, quote);
12391 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012392 if (unchanged) {
12393 _PyUnicode_FastCopyCharacters(repr, 1,
12394 unicode, 0,
12395 isize);
12396 }
12397 else {
12398 for (i = 0, o = 1; i < isize; i++) {
12399 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400
Victor Stinner55c08782013-04-14 18:45:39 +020012401 /* Escape quotes and backslashes */
12402 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012403 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012405 continue;
12406 }
12407
12408 /* Map special whitespace to '\t', \n', '\r' */
12409 if (ch == '\t') {
12410 PyUnicode_WRITE(okind, odata, o++, '\\');
12411 PyUnicode_WRITE(okind, odata, o++, 't');
12412 }
12413 else if (ch == '\n') {
12414 PyUnicode_WRITE(okind, odata, o++, '\\');
12415 PyUnicode_WRITE(okind, odata, o++, 'n');
12416 }
12417 else if (ch == '\r') {
12418 PyUnicode_WRITE(okind, odata, o++, '\\');
12419 PyUnicode_WRITE(okind, odata, o++, 'r');
12420 }
12421
12422 /* Map non-printable US ASCII to '\xhh' */
12423 else if (ch < ' ' || ch == 0x7F) {
12424 PyUnicode_WRITE(okind, odata, o++, '\\');
12425 PyUnicode_WRITE(okind, odata, o++, 'x');
12426 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12427 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12428 }
12429
12430 /* Copy ASCII characters as-is */
12431 else if (ch < 0x7F) {
12432 PyUnicode_WRITE(okind, odata, o++, ch);
12433 }
12434
12435 /* Non-ASCII characters */
12436 else {
12437 /* Map Unicode whitespace and control characters
12438 (categories Z* and C* except ASCII space)
12439 */
12440 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12441 PyUnicode_WRITE(okind, odata, o++, '\\');
12442 /* Map 8-bit characters to '\xhh' */
12443 if (ch <= 0xff) {
12444 PyUnicode_WRITE(okind, odata, o++, 'x');
12445 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12446 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12447 }
12448 /* Map 16-bit characters to '\uxxxx' */
12449 else if (ch <= 0xffff) {
12450 PyUnicode_WRITE(okind, odata, o++, 'u');
12451 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12452 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12453 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12454 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12455 }
12456 /* Map 21-bit characters to '\U00xxxxxx' */
12457 else {
12458 PyUnicode_WRITE(okind, odata, o++, 'U');
12459 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12460 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12461 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12462 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12463 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12464 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12465 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12466 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12467 }
12468 }
12469 /* Copy characters as-is */
12470 else {
12471 PyUnicode_WRITE(okind, odata, o++, ch);
12472 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012473 }
12474 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012477 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012478 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012479}
12480
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012481PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012482 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012483\n\
12484Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012485such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486arguments start and end are interpreted as in slice notation.\n\
12487\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012488Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489
12490static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012491unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012492{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012493 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012494 Py_ssize_t start;
12495 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012496 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497
Jesus Ceaac451502011-04-20 17:09:23 +020012498 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12499 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012500 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501
Christian Heimesea71a522013-06-29 21:17:34 +020012502 if (PyUnicode_READY(self) == -1) {
12503 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012505 }
12506 if (PyUnicode_READY(substring) == -1) {
12507 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012508 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012509 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012510
Victor Stinner7931d9a2011-11-04 00:22:48 +010012511 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012512
12513 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012514
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012515 if (result == -2)
12516 return NULL;
12517
Christian Heimes217cfd12007-12-02 14:31:20 +000012518 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519}
12520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012521PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012522 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012524Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525
12526static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012529 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012530 Py_ssize_t start;
12531 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012532 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012533
Jesus Ceaac451502011-04-20 17:09:23 +020012534 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12535 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012536 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012537
Christian Heimesea71a522013-06-29 21:17:34 +020012538 if (PyUnicode_READY(self) == -1) {
12539 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012540 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012541 }
12542 if (PyUnicode_READY(substring) == -1) {
12543 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012545 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012546
Victor Stinner7931d9a2011-11-04 00:22:48 +010012547 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548
12549 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012550
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 if (result == -2)
12552 return NULL;
12553
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554 if (result < 0) {
12555 PyErr_SetString(PyExc_ValueError, "substring not found");
12556 return NULL;
12557 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012558
Christian Heimes217cfd12007-12-02 14:31:20 +000012559 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560}
12561
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012562PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012563 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012565Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012566done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567
12568static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012569unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012571 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572 Py_UCS4 fillchar = ' ';
12573
Victor Stinnere9a29352011-10-01 02:14:59 +020012574 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012576
Benjamin Petersonbac79492012-01-14 13:34:47 -050012577 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012578 return NULL;
12579
Victor Stinnerc4b49542011-12-11 22:44:26 +010012580 if (PyUnicode_GET_LENGTH(self) >= width)
12581 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582
Victor Stinnerc4b49542011-12-11 22:44:26 +010012583 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584}
12585
Alexander Belopolsky40018472011-02-26 01:02:56 +000012586PyObject *
12587PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588{
12589 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012590
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591 s = PyUnicode_FromObject(s);
12592 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012593 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012594 if (sep != NULL) {
12595 sep = PyUnicode_FromObject(sep);
12596 if (sep == NULL) {
12597 Py_DECREF(s);
12598 return NULL;
12599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600 }
12601
Victor Stinner9310abb2011-10-05 00:59:23 +020012602 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012603
12604 Py_DECREF(s);
12605 Py_XDECREF(sep);
12606 return result;
12607}
12608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012609PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012610 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611\n\
12612Return a list of the words in S, using sep as the\n\
12613delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012614splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012615whitespace string is a separator and empty strings are\n\
12616removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617
12618static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012619unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012621 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012623 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012625 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12626 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627 return NULL;
12628
12629 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012630 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012632 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012634 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012635}
12636
Thomas Wouters477c8d52006-05-27 19:21:47 +000012637PyObject *
12638PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12639{
12640 PyObject* str_obj;
12641 PyObject* sep_obj;
12642 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 int kind1, kind2, kind;
12644 void *buf1 = NULL, *buf2 = NULL;
12645 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012646
12647 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012648 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012649 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012650 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012651 if (!sep_obj) {
12652 Py_DECREF(str_obj);
12653 return NULL;
12654 }
12655 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12656 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012657 Py_DECREF(str_obj);
12658 return NULL;
12659 }
12660
Victor Stinner14f8f022011-10-05 20:58:25 +020012661 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012663 kind = Py_MAX(kind1, kind2);
12664 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012666 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 if (!buf1)
12668 goto onError;
12669 buf2 = PyUnicode_DATA(sep_obj);
12670 if (kind2 != kind)
12671 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12672 if (!buf2)
12673 goto onError;
12674 len1 = PyUnicode_GET_LENGTH(str_obj);
12675 len2 = PyUnicode_GET_LENGTH(sep_obj);
12676
Benjamin Petersonead6b532011-12-20 17:23:42 -060012677 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012679 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12680 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12681 else
12682 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 break;
12684 case PyUnicode_2BYTE_KIND:
12685 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12686 break;
12687 case PyUnicode_4BYTE_KIND:
12688 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12689 break;
12690 default:
12691 assert(0);
12692 out = 0;
12693 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012694
12695 Py_DECREF(sep_obj);
12696 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012697 if (kind1 != kind)
12698 PyMem_Free(buf1);
12699 if (kind2 != kind)
12700 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012701
12702 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703 onError:
12704 Py_DECREF(sep_obj);
12705 Py_DECREF(str_obj);
12706 if (kind1 != kind && buf1)
12707 PyMem_Free(buf1);
12708 if (kind2 != kind && buf2)
12709 PyMem_Free(buf2);
12710 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012711}
12712
12713
12714PyObject *
12715PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12716{
12717 PyObject* str_obj;
12718 PyObject* sep_obj;
12719 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720 int kind1, kind2, kind;
12721 void *buf1 = NULL, *buf2 = NULL;
12722 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012723
12724 str_obj = PyUnicode_FromObject(str_in);
12725 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012726 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012727 sep_obj = PyUnicode_FromObject(sep_in);
12728 if (!sep_obj) {
12729 Py_DECREF(str_obj);
12730 return NULL;
12731 }
12732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012733 kind1 = PyUnicode_KIND(str_in);
12734 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012735 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 buf1 = PyUnicode_DATA(str_in);
12737 if (kind1 != kind)
12738 buf1 = _PyUnicode_AsKind(str_in, kind);
12739 if (!buf1)
12740 goto onError;
12741 buf2 = PyUnicode_DATA(sep_obj);
12742 if (kind2 != kind)
12743 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12744 if (!buf2)
12745 goto onError;
12746 len1 = PyUnicode_GET_LENGTH(str_obj);
12747 len2 = PyUnicode_GET_LENGTH(sep_obj);
12748
Benjamin Petersonead6b532011-12-20 17:23:42 -060012749 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012751 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12752 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12753 else
12754 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 break;
12756 case PyUnicode_2BYTE_KIND:
12757 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12758 break;
12759 case PyUnicode_4BYTE_KIND:
12760 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12761 break;
12762 default:
12763 assert(0);
12764 out = 0;
12765 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012766
12767 Py_DECREF(sep_obj);
12768 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769 if (kind1 != kind)
12770 PyMem_Free(buf1);
12771 if (kind2 != kind)
12772 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012773
12774 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775 onError:
12776 Py_DECREF(sep_obj);
12777 Py_DECREF(str_obj);
12778 if (kind1 != kind && buf1)
12779 PyMem_Free(buf1);
12780 if (kind2 != kind && buf2)
12781 PyMem_Free(buf2);
12782 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012783}
12784
12785PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012786 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012787\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012788Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012789the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012790found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012791
12792static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012793unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012794{
Victor Stinner9310abb2011-10-05 00:59:23 +020012795 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012796}
12797
12798PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012799 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012800\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012801Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012802the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012803separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012804
12805static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012806unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012807{
Victor Stinner9310abb2011-10-05 00:59:23 +020012808 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012809}
12810
Alexander Belopolsky40018472011-02-26 01:02:56 +000012811PyObject *
12812PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012813{
12814 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012815
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012816 s = PyUnicode_FromObject(s);
12817 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012818 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012819 if (sep != NULL) {
12820 sep = PyUnicode_FromObject(sep);
12821 if (sep == NULL) {
12822 Py_DECREF(s);
12823 return NULL;
12824 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012825 }
12826
Victor Stinner9310abb2011-10-05 00:59:23 +020012827 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012828
12829 Py_DECREF(s);
12830 Py_XDECREF(sep);
12831 return result;
12832}
12833
12834PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012835 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012836\n\
12837Return a list of the words in S, using sep as the\n\
12838delimiter string, starting at the end of the string and\n\
12839working to the front. If maxsplit is given, at most maxsplit\n\
12840splits are done. If sep is not specified, any whitespace string\n\
12841is a separator.");
12842
12843static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012844unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012845{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012846 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012847 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012848 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012849
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012850 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12851 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012852 return NULL;
12853
12854 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012855 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012856 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012857 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012858 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012859 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012860}
12861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012862PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012863 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012864\n\
12865Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012866Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012867is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868
12869static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012870unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012871{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012872 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012873 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012875 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12876 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877 return NULL;
12878
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012879 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880}
12881
12882static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012883PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012885 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886}
12887
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012888PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012889 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012890\n\
12891Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012892and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893
12894static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012895unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012896{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012897 if (PyUnicode_READY(self) == -1)
12898 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012899 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012900}
12901
Larry Hastings61272b72014-01-07 12:41:53 -080012902/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012903
Larry Hastings31826802013-10-19 00:09:25 -070012904@staticmethod
12905str.maketrans as unicode_maketrans
12906
12907 x: object
12908
12909 y: unicode=NULL
12910
12911 z: unicode=NULL
12912
12913 /
12914
12915Return a translation table usable for str.translate().
12916
12917If there is only one argument, it must be a dictionary mapping Unicode
12918ordinals (integers) or characters to Unicode ordinals, strings or None.
12919Character keys will be then converted to ordinals.
12920If there are two arguments, they must be strings of equal length, and
12921in the resulting dictionary, each character in x will be mapped to the
12922character at the same position in y. If there is a third argument, it
12923must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012924[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012925
12926PyDoc_STRVAR(unicode_maketrans__doc__,
Larry Hastings2623c8c2014-02-08 22:15:29 -080012927"maketrans(x, y=None, z=None, /)\n"
12928"--\n"
12929"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012930"Return a translation table usable for str.translate().\n"
12931"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012932"If there is only one argument, it must be a dictionary mapping Unicode\n"
12933"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12934"Character keys will be then converted to ordinals.\n"
12935"If there are two arguments, they must be strings of equal length, and\n"
12936"in the resulting dictionary, each character in x will be mapped to the\n"
12937"character at the same position in y. If there is a third argument, it\n"
12938"must be a string, whose characters will be mapped to None in the result.");
12939
12940#define UNICODE_MAKETRANS_METHODDEF \
12941 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12942
12943static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012944unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
Larry Hastings31826802013-10-19 00:09:25 -070012945
12946static PyObject *
Larry Hastingsebdcb502013-11-23 14:54:00 -080012947unicode_maketrans(void *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012948{
Larry Hastings31826802013-10-19 00:09:25 -070012949 PyObject *return_value = NULL;
12950 PyObject *x;
12951 PyObject *y = NULL;
12952 PyObject *z = NULL;
12953
12954 if (!PyArg_ParseTuple(args,
12955 "O|UU:maketrans",
12956 &x, &y, &z))
12957 goto exit;
Larry Hastings5c661892014-01-24 06:17:25 -080012958 return_value = unicode_maketrans_impl(x, y, z);
Larry Hastings31826802013-10-19 00:09:25 -070012959
12960exit:
12961 return return_value;
12962}
12963
12964static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012965unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Larry Hastings2623c8c2014-02-08 22:15:29 -080012966/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012967{
Georg Brandlceee0772007-11-27 23:48:05 +000012968 PyObject *new = NULL, *key, *value;
12969 Py_ssize_t i = 0;
12970 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012971
Georg Brandlceee0772007-11-27 23:48:05 +000012972 new = PyDict_New();
12973 if (!new)
12974 return NULL;
12975 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012976 int x_kind, y_kind, z_kind;
12977 void *x_data, *y_data, *z_data;
12978
Georg Brandlceee0772007-11-27 23:48:05 +000012979 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012980 if (!PyUnicode_Check(x)) {
12981 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12982 "be a string if there is a second argument");
12983 goto err;
12984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012986 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12987 "arguments must have equal length");
12988 goto err;
12989 }
12990 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 x_kind = PyUnicode_KIND(x);
12992 y_kind = PyUnicode_KIND(y);
12993 x_data = PyUnicode_DATA(x);
12994 y_data = PyUnicode_DATA(y);
12995 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12996 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012997 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012998 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012999 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013000 if (!value) {
13001 Py_DECREF(key);
13002 goto err;
13003 }
Georg Brandlceee0772007-11-27 23:48:05 +000013004 res = PyDict_SetItem(new, key, value);
13005 Py_DECREF(key);
13006 Py_DECREF(value);
13007 if (res < 0)
13008 goto err;
13009 }
13010 /* create entries for deleting chars in z */
13011 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013012 z_kind = PyUnicode_KIND(z);
13013 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013014 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013016 if (!key)
13017 goto err;
13018 res = PyDict_SetItem(new, key, Py_None);
13019 Py_DECREF(key);
13020 if (res < 0)
13021 goto err;
13022 }
13023 }
13024 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 int kind;
13026 void *data;
13027
Georg Brandlceee0772007-11-27 23:48:05 +000013028 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013029 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013030 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13031 "to maketrans it must be a dict");
13032 goto err;
13033 }
13034 /* copy entries into the new dict, converting string keys to int keys */
13035 while (PyDict_Next(x, &i, &key, &value)) {
13036 if (PyUnicode_Check(key)) {
13037 /* convert string keys to integer keys */
13038 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013039 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013040 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13041 "table must be of length 1");
13042 goto err;
13043 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013044 kind = PyUnicode_KIND(key);
13045 data = PyUnicode_DATA(key);
13046 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013047 if (!newkey)
13048 goto err;
13049 res = PyDict_SetItem(new, newkey, value);
13050 Py_DECREF(newkey);
13051 if (res < 0)
13052 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013053 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013054 /* just keep integer keys */
13055 if (PyDict_SetItem(new, key, value) < 0)
13056 goto err;
13057 } else {
13058 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13059 "be strings or integers");
13060 goto err;
13061 }
13062 }
13063 }
13064 return new;
13065 err:
13066 Py_DECREF(new);
13067 return NULL;
13068}
13069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013070PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013071 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013072\n\
13073Return a copy of the string S, where all characters have been mapped\n\
13074through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013075Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013076Unmapped characters are left untouched. Characters mapped to None\n\
13077are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013078
13079static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013080unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013081{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013082 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013083}
13084
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013085PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013086 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013087\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013088Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089
13090static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013091unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013092{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013093 if (PyUnicode_READY(self) == -1)
13094 return NULL;
13095 if (PyUnicode_IS_ASCII(self))
13096 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013097 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013098}
13099
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013100PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013101 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013103Pad a numeric string S with zeros on the left, to fill a field\n\
13104of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105
13106static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013107unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013109 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013110 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013111 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013112 int kind;
13113 void *data;
13114 Py_UCS4 chr;
13115
Martin v. Löwis18e16552006-02-15 17:27:45 +000013116 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013117 return NULL;
13118
Benjamin Petersonbac79492012-01-14 13:34:47 -050013119 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013120 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013121
Victor Stinnerc4b49542011-12-11 22:44:26 +010013122 if (PyUnicode_GET_LENGTH(self) >= width)
13123 return unicode_result_unchanged(self);
13124
13125 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126
13127 u = pad(self, fill, 0, '0');
13128
Walter Dörwald068325e2002-04-15 13:36:47 +000013129 if (u == NULL)
13130 return NULL;
13131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013132 kind = PyUnicode_KIND(u);
13133 data = PyUnicode_DATA(u);
13134 chr = PyUnicode_READ(kind, data, fill);
13135
13136 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013137 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013138 PyUnicode_WRITE(kind, data, 0, chr);
13139 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013140 }
13141
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013142 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013143 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145
#if 0
/* Compiled out by the surrounding #if 0.  Thin wrapper that forwards
   self to PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013154PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013155 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013156\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013157Return True if S starts with the specified prefix, False otherwise.\n\
13158With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013159With optional end, stop comparing S at that position.\n\
13160prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013161
13162static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013163unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013164 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013165{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013166 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013167 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013168 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013169 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013170 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013171
Jesus Ceaac451502011-04-20 17:09:23 +020013172 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013173 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013174 if (PyTuple_Check(subobj)) {
13175 Py_ssize_t i;
13176 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013177 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013178 if (substring == NULL)
13179 return NULL;
13180 result = tailmatch(self, substring, start, end, -1);
13181 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013182 if (result == -1)
13183 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013184 if (result) {
13185 Py_RETURN_TRUE;
13186 }
13187 }
13188 /* nothing matched */
13189 Py_RETURN_FALSE;
13190 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013191 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013192 if (substring == NULL) {
13193 if (PyErr_ExceptionMatches(PyExc_TypeError))
13194 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13195 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013196 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013197 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013198 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013199 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013200 if (result == -1)
13201 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013202 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013203}
13204
13205
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013206PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013207 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013208\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013209Return True if S ends with the specified suffix, False otherwise.\n\
13210With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013211With optional end, stop comparing S at that position.\n\
13212suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013213
13214static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013215unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013216 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013218 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013219 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013220 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013221 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013222 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223
Jesus Ceaac451502011-04-20 17:09:23 +020013224 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013225 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013226 if (PyTuple_Check(subobj)) {
13227 Py_ssize_t i;
13228 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013229 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013230 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013231 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013232 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013233 result = tailmatch(self, substring, start, end, +1);
13234 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013235 if (result == -1)
13236 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013237 if (result) {
13238 Py_RETURN_TRUE;
13239 }
13240 }
13241 Py_RETURN_FALSE;
13242 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013243 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013244 if (substring == NULL) {
13245 if (PyErr_ExceptionMatches(PyExc_TypeError))
13246 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13247 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013248 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013249 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013250 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013251 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013252 if (result == -1)
13253 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013254 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013255}
13256
Victor Stinner202fdca2012-05-07 12:47:02 +020013257Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013258_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013259{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013260 if (!writer->readonly)
13261 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13262 else {
13263 /* Copy-on-write mode: set buffer size to 0 so
13264 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13265 * next write. */
13266 writer->size = 0;
13267 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013268 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13269 writer->data = PyUnicode_DATA(writer->buffer);
13270 writer->kind = PyUnicode_KIND(writer->buffer);
13271}
13272
Victor Stinnerd3f08822012-05-29 12:57:52 +020013273void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013274_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013275{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013276 memset(writer, 0, sizeof(*writer));
13277#ifdef Py_DEBUG
13278 writer->kind = 5; /* invalid kind */
13279#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013280 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013281}
13282
Victor Stinnerd3f08822012-05-29 12:57:52 +020013283int
13284_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13285 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013286{
Victor Stinner6989ba02013-11-18 21:08:39 +010013287#ifdef MS_WINDOWS
13288 /* On Windows, overallocate by 50% is the best factor */
13289# define OVERALLOCATE_FACTOR 2
13290#else
13291 /* On Linux, overallocate by 25% is the best factor */
13292# define OVERALLOCATE_FACTOR 4
13293#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013294 Py_ssize_t newlen;
13295 PyObject *newbuffer;
13296
Victor Stinnerd3f08822012-05-29 12:57:52 +020013297 assert(length > 0);
13298
Victor Stinner202fdca2012-05-07 12:47:02 +020013299 if (length > PY_SSIZE_T_MAX - writer->pos) {
13300 PyErr_NoMemory();
13301 return -1;
13302 }
13303 newlen = writer->pos + length;
13304
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013305 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013306
Victor Stinnerd3f08822012-05-29 12:57:52 +020013307 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013308 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013309 if (writer->overallocate
13310 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13311 /* overallocate to limit the number of realloc() */
13312 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013313 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013314 if (newlen < writer->min_length)
13315 newlen = writer->min_length;
13316
Victor Stinnerd3f08822012-05-29 12:57:52 +020013317 writer->buffer = PyUnicode_New(newlen, maxchar);
13318 if (writer->buffer == NULL)
13319 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013320 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013321 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013322 if (writer->overallocate
13323 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13324 /* overallocate to limit the number of realloc() */
13325 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013326 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013327 if (newlen < writer->min_length)
13328 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013329
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013330 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013331 /* resize + widen */
13332 newbuffer = PyUnicode_New(newlen, maxchar);
13333 if (newbuffer == NULL)
13334 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013335 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13336 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013337 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013338 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013339 }
13340 else {
13341 newbuffer = resize_compact(writer->buffer, newlen);
13342 if (newbuffer == NULL)
13343 return -1;
13344 }
13345 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013346 }
13347 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013348 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013349 newbuffer = PyUnicode_New(writer->size, maxchar);
13350 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013351 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013352 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13353 writer->buffer, 0, writer->pos);
13354 Py_DECREF(writer->buffer);
13355 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013356 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013357 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013358 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013359
13360#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013361}
13362
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013363Py_LOCAL_INLINE(int)
13364_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013365{
13366 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13367 return -1;
13368 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13369 writer->pos++;
13370 return 0;
13371}
13372
13373int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013374_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13375{
13376 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13377}
13378
13379int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013380_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13381{
13382 Py_UCS4 maxchar;
13383 Py_ssize_t len;
13384
13385 if (PyUnicode_READY(str) == -1)
13386 return -1;
13387 len = PyUnicode_GET_LENGTH(str);
13388 if (len == 0)
13389 return 0;
13390 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13391 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013392 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013393 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013394 Py_INCREF(str);
13395 writer->buffer = str;
13396 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013397 writer->pos += len;
13398 return 0;
13399 }
13400 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13401 return -1;
13402 }
13403 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13404 str, 0, len);
13405 writer->pos += len;
13406 return 0;
13407}
13408
Victor Stinnere215d962012-10-06 23:03:36 +020013409int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013410_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13411 Py_ssize_t start, Py_ssize_t end)
13412{
13413 Py_UCS4 maxchar;
13414 Py_ssize_t len;
13415
13416 if (PyUnicode_READY(str) == -1)
13417 return -1;
13418
13419 assert(0 <= start);
13420 assert(end <= PyUnicode_GET_LENGTH(str));
13421 assert(start <= end);
13422
13423 if (end == 0)
13424 return 0;
13425
13426 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13427 return _PyUnicodeWriter_WriteStr(writer, str);
13428
13429 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13430 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13431 else
13432 maxchar = writer->maxchar;
13433 len = end - start;
13434
13435 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13436 return -1;
13437
13438 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13439 str, start, len);
13440 writer->pos += len;
13441 return 0;
13442}
13443
13444int
Victor Stinner4a587072013-11-19 12:54:53 +010013445_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13446 const char *ascii, Py_ssize_t len)
13447{
13448 if (len == -1)
13449 len = strlen(ascii);
13450
13451 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13452
13453 if (writer->buffer == NULL && !writer->overallocate) {
13454 PyObject *str;
13455
13456 str = _PyUnicode_FromASCII(ascii, len);
13457 if (str == NULL)
13458 return -1;
13459
13460 writer->readonly = 1;
13461 writer->buffer = str;
13462 _PyUnicodeWriter_Update(writer);
13463 writer->pos += len;
13464 return 0;
13465 }
13466
13467 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13468 return -1;
13469
13470 switch (writer->kind)
13471 {
13472 case PyUnicode_1BYTE_KIND:
13473 {
13474 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13475 Py_UCS1 *data = writer->data;
13476
13477 Py_MEMCPY(data + writer->pos, str, len);
13478 break;
13479 }
13480 case PyUnicode_2BYTE_KIND:
13481 {
13482 _PyUnicode_CONVERT_BYTES(
13483 Py_UCS1, Py_UCS2,
13484 ascii, ascii + len,
13485 (Py_UCS2 *)writer->data + writer->pos);
13486 break;
13487 }
13488 case PyUnicode_4BYTE_KIND:
13489 {
13490 _PyUnicode_CONVERT_BYTES(
13491 Py_UCS1, Py_UCS4,
13492 ascii, ascii + len,
13493 (Py_UCS4 *)writer->data + writer->pos);
13494 break;
13495 }
13496 default:
13497 assert(0);
13498 }
13499
13500 writer->pos += len;
13501 return 0;
13502}
13503
13504int
13505_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13506 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013507{
13508 Py_UCS4 maxchar;
13509
13510 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13511 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13512 return -1;
13513 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13514 writer->pos += len;
13515 return 0;
13516}
13517
Victor Stinnerd3f08822012-05-29 12:57:52 +020013518PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013519_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013520{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013521 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013522 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013523 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013524 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013525 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013526 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013527 str = writer->buffer;
13528 writer->buffer = NULL;
13529 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13530 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013531 }
13532 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13533 PyObject *newbuffer;
13534 newbuffer = resize_compact(writer->buffer, writer->pos);
13535 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013536 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013537 return NULL;
13538 }
13539 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013540 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013541 str = writer->buffer;
13542 writer->buffer = NULL;
13543 assert(_PyUnicode_CheckConsistency(str, 1));
13544 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013545}
13546
Victor Stinnerd3f08822012-05-29 12:57:52 +020013547void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013548_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013549{
13550 Py_CLEAR(writer->buffer);
13551}
13552
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013553#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013554
13555PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013556 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013557\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013558Return a formatted version of S, using substitutions from args and kwargs.\n\
13559The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013560
Eric Smith27bbca62010-11-04 17:06:58 +000013561PyDoc_STRVAR(format_map__doc__,
13562 "S.format_map(mapping) -> str\n\
13563\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013564Return a formatted version of S, using substitutions from mapping.\n\
13565The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013566
Eric Smith4a7d76d2008-05-30 18:10:19 +000013567static PyObject *
13568unicode__format__(PyObject* self, PyObject* args)
13569{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013570 PyObject *format_spec;
13571 _PyUnicodeWriter writer;
13572 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013573
13574 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13575 return NULL;
13576
Victor Stinnerd3f08822012-05-29 12:57:52 +020013577 if (PyUnicode_READY(self) == -1)
13578 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013579 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013580 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13581 self, format_spec, 0,
13582 PyUnicode_GET_LENGTH(format_spec));
13583 if (ret == -1) {
13584 _PyUnicodeWriter_Dealloc(&writer);
13585 return NULL;
13586 }
13587 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013588}
13589
Eric Smith8c663262007-08-25 02:26:07 +000013590PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013591 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013592\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013593Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013594
13595static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013596unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013597{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013598 Py_ssize_t size;
13599
13600 /* If it's a compact object, account for base structure +
13601 character data. */
13602 if (PyUnicode_IS_COMPACT_ASCII(v))
13603 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13604 else if (PyUnicode_IS_COMPACT(v))
13605 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013606 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013607 else {
13608 /* If it is a two-block object, account for base object, and
13609 for character block if present. */
13610 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013611 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013612 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013613 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013614 }
13615 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013616 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013617 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013618 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013619 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013620 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013621
13622 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013623}
13624
13625PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013626 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013627
13628static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013629unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013630{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013631 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013632 if (!copy)
13633 return NULL;
13634 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013635}
13636
Guido van Rossumd57fd912000-03-10 22:53:23 +000013637static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013638 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013639 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013640 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13641 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013642 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13643 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013644 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013645 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13646 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13647 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013648 {"expandtabs", (PyCFunction) unicode_expandtabs,
13649 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013650 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013651 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013652 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13653 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13654 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013655 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013656 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13657 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13658 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013659 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013660 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013661 {"splitlines", (PyCFunction) unicode_splitlines,
13662 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013663 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013664 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13665 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13666 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13667 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13668 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13669 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13670 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13671 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13672 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13673 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13674 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13675 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13676 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13677 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013678 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013679 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013680 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013681 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013682 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013683 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013684 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013685 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013686#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013687 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013688 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013689#endif
13690
Benjamin Peterson14339b62009-01-31 16:36:08 +000013691 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013692 {NULL, NULL}
13693};
13694
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013695static PyObject *
13696unicode_mod(PyObject *v, PyObject *w)
13697{
Brian Curtindfc80e32011-08-10 20:28:54 -050013698 if (!PyUnicode_Check(v))
13699 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013700 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013701}
13702
13703static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013704 0, /*nb_add*/
13705 0, /*nb_subtract*/
13706 0, /*nb_multiply*/
13707 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013708};
13709
Guido van Rossumd57fd912000-03-10 22:53:23 +000013710static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013711 (lenfunc) unicode_length, /* sq_length */
13712 PyUnicode_Concat, /* sq_concat */
13713 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13714 (ssizeargfunc) unicode_getitem, /* sq_item */
13715 0, /* sq_slice */
13716 0, /* sq_ass_item */
13717 0, /* sq_ass_slice */
13718 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013719};
13720
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013721static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013722unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013723{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013724 if (PyUnicode_READY(self) == -1)
13725 return NULL;
13726
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013727 if (PyIndex_Check(item)) {
13728 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013729 if (i == -1 && PyErr_Occurred())
13730 return NULL;
13731 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013732 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013733 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013734 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013735 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013736 PyObject *result;
13737 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013738 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013739 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013741 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013742 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013743 return NULL;
13744 }
13745
13746 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013747 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013748 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013749 slicelength == PyUnicode_GET_LENGTH(self)) {
13750 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013751 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013752 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013753 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013754 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013755 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013756 src_kind = PyUnicode_KIND(self);
13757 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013758 if (!PyUnicode_IS_ASCII(self)) {
13759 kind_limit = kind_maxchar_limit(src_kind);
13760 max_char = 0;
13761 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13762 ch = PyUnicode_READ(src_kind, src_data, cur);
13763 if (ch > max_char) {
13764 max_char = ch;
13765 if (max_char >= kind_limit)
13766 break;
13767 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013768 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013769 }
Victor Stinner55c99112011-10-13 01:17:06 +020013770 else
13771 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013772 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013773 if (result == NULL)
13774 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013775 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013776 dest_data = PyUnicode_DATA(result);
13777
13778 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013779 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13780 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013781 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013782 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013783 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013784 } else {
13785 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13786 return NULL;
13787 }
13788}
13789
13790static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013791 (lenfunc)unicode_length, /* mp_length */
13792 (binaryfunc)unicode_subscript, /* mp_subscript */
13793 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013794};
13795
Guido van Rossumd57fd912000-03-10 22:53:23 +000013796
Guido van Rossumd57fd912000-03-10 22:53:23 +000013797/* Helpers for PyUnicode_Format() */
13798
Victor Stinnera47082312012-10-04 02:19:54 +020013799struct unicode_formatter_t {
13800 PyObject *args;
13801 int args_owned;
13802 Py_ssize_t arglen, argidx;
13803 PyObject *dict;
13804
13805 enum PyUnicode_Kind fmtkind;
13806 Py_ssize_t fmtcnt, fmtpos;
13807 void *fmtdata;
13808 PyObject *fmtstr;
13809
13810 _PyUnicodeWriter writer;
13811};
13812
13813struct unicode_format_arg_t {
13814 Py_UCS4 ch;
13815 int flags;
13816 Py_ssize_t width;
13817 int prec;
13818 int sign;
13819};
13820
Guido van Rossumd57fd912000-03-10 22:53:23 +000013821static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013822unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013823{
Victor Stinnera47082312012-10-04 02:19:54 +020013824 Py_ssize_t argidx = ctx->argidx;
13825
13826 if (argidx < ctx->arglen) {
13827 ctx->argidx++;
13828 if (ctx->arglen < 0)
13829 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013830 else
Victor Stinnera47082312012-10-04 02:19:54 +020013831 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013832 }
13833 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013834 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013835 return NULL;
13836}
13837
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013838/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013839
Victor Stinnera47082312012-10-04 02:19:54 +020013840/* Format a float into the writer if the writer is not NULL, or into *p_output
13841 otherwise.
13842
13843 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013844static int
Victor Stinnera47082312012-10-04 02:19:54 +020013845formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13846 PyObject **p_output,
13847 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013848{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013849 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013850 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013851 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013852 int prec;
13853 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013854
Guido van Rossumd57fd912000-03-10 22:53:23 +000013855 x = PyFloat_AsDouble(v);
13856 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013857 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013858
Victor Stinnera47082312012-10-04 02:19:54 +020013859 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013860 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013861 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013862
Victor Stinnera47082312012-10-04 02:19:54 +020013863 if (arg->flags & F_ALT)
13864 dtoa_flags = Py_DTSF_ALT;
13865 else
13866 dtoa_flags = 0;
13867 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013868 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013869 return -1;
13870 len = strlen(p);
13871 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013872 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013873 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013874 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013875 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013876 }
13877 else
13878 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013879 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013880 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013881}
13882
Victor Stinnerd0880d52012-04-27 23:40:13 +020013883/* formatlong() emulates the format codes d, u, o, x and X, and
13884 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13885 * Python's regular ints.
13886 * Return value: a new PyUnicodeObject*, or NULL if error.
13887 * The output string is of the form
13888 * "-"? ("0x" | "0X")? digit+
13889 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13890 * set in flags. The case of hex digits will be correct,
13891 * There will be at least prec digits, zero-filled on the left if
13892 * necessary to get that many.
13893 * val object to be converted
13894 * flags bitmask of format flags; only F_ALT is looked at
13895 * prec minimum number of digits; 0-fill on left if needed
13896 * type a character in [duoxX]; u acts the same as d
13897 *
13898 * CAUTION: o, x and X conversions on regular ints can never
13899 * produce a '-' sign, but can for Python's unbounded ints.
13900 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013901PyObject *
13902_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013903{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013904 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013905 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013906 Py_ssize_t i;
13907 int sign; /* 1 if '-', else 0 */
13908 int len; /* number of characters */
13909 Py_ssize_t llen;
13910 int numdigits; /* len == numnondigits + numdigits */
13911 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013912
Victor Stinnerd0880d52012-04-27 23:40:13 +020013913 /* Avoid exceeding SSIZE_T_MAX */
13914 if (prec > INT_MAX-3) {
13915 PyErr_SetString(PyExc_OverflowError,
13916 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013917 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013918 }
13919
13920 assert(PyLong_Check(val));
13921
13922 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013923 default:
13924 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013925 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013926 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013927 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013928 /* int and int subclasses should print numerically when a numeric */
13929 /* format code is used (see issue18780) */
13930 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013931 break;
13932 case 'o':
13933 numnondigits = 2;
13934 result = PyNumber_ToBase(val, 8);
13935 break;
13936 case 'x':
13937 case 'X':
13938 numnondigits = 2;
13939 result = PyNumber_ToBase(val, 16);
13940 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013941 }
13942 if (!result)
13943 return NULL;
13944
13945 assert(unicode_modifiable(result));
13946 assert(PyUnicode_IS_READY(result));
13947 assert(PyUnicode_IS_ASCII(result));
13948
13949 /* To modify the string in-place, there can only be one reference. */
13950 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013951 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013952 PyErr_BadInternalCall();
13953 return NULL;
13954 }
13955 buf = PyUnicode_DATA(result);
13956 llen = PyUnicode_GET_LENGTH(result);
13957 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013958 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013959 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080013960 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013961 return NULL;
13962 }
13963 len = (int)llen;
13964 sign = buf[0] == '-';
13965 numnondigits += sign;
13966 numdigits = len - numnondigits;
13967 assert(numdigits > 0);
13968
13969 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013970 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013971 (type == 'o' || type == 'x' || type == 'X'))) {
13972 assert(buf[sign] == '0');
13973 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13974 buf[sign+1] == 'o');
13975 numnondigits -= 2;
13976 buf += 2;
13977 len -= 2;
13978 if (sign)
13979 buf[0] = '-';
13980 assert(len == numnondigits + numdigits);
13981 assert(numdigits > 0);
13982 }
13983
13984 /* Fill with leading zeroes to meet minimum width. */
13985 if (prec > numdigits) {
13986 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13987 numnondigits + prec);
13988 char *b1;
13989 if (!r1) {
13990 Py_DECREF(result);
13991 return NULL;
13992 }
13993 b1 = PyBytes_AS_STRING(r1);
13994 for (i = 0; i < numnondigits; ++i)
13995 *b1++ = *buf++;
13996 for (i = 0; i < prec - numdigits; i++)
13997 *b1++ = '0';
13998 for (i = 0; i < numdigits; i++)
13999 *b1++ = *buf++;
14000 *b1 = '\0';
14001 Py_DECREF(result);
14002 result = r1;
14003 buf = PyBytes_AS_STRING(result);
14004 len = numnondigits + prec;
14005 }
14006
14007 /* Fix up case for hex conversions. */
14008 if (type == 'X') {
14009 /* Need to convert all lower case letters to upper case.
14010 and need to convert 0x to 0X (and -0x to -0X). */
14011 for (i = 0; i < len; i++)
14012 if (buf[i] >= 'a' && buf[i] <= 'x')
14013 buf[i] -= 'a'-'A';
14014 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014015 if (!PyUnicode_Check(result)
14016 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014017 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014018 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014019 Py_DECREF(result);
14020 result = unicode;
14021 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014022 else if (len != PyUnicode_GET_LENGTH(result)) {
14023 if (PyUnicode_Resize(&result, len) < 0)
14024 Py_CLEAR(result);
14025 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014026 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014027}
14028
Ethan Furmandf3ed242014-01-05 06:50:30 -080014029/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014030 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014031 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014032 * -1 and raise an exception on error */
14033static int
Victor Stinnera47082312012-10-04 02:19:54 +020014034mainformatlong(PyObject *v,
14035 struct unicode_format_arg_t *arg,
14036 PyObject **p_output,
14037 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014038{
14039 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014040 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014041
14042 if (!PyNumber_Check(v))
14043 goto wrongtype;
14044
Ethan Furman9ab74802014-03-21 06:38:46 -070014045 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014046 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014047 if (type == 'o' || type == 'x' || type == 'X') {
14048 iobj = PyNumber_Index(v);
14049 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014050 if (PyErr_ExceptionMatches(PyExc_TypeError))
14051 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014052 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014053 }
14054 }
14055 else {
14056 iobj = PyNumber_Long(v);
14057 if (iobj == NULL ) {
14058 if (PyErr_ExceptionMatches(PyExc_TypeError))
14059 goto wrongtype;
14060 return -1;
14061 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014062 }
14063 assert(PyLong_Check(iobj));
14064 }
14065 else {
14066 iobj = v;
14067 Py_INCREF(iobj);
14068 }
14069
14070 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014071 && arg->width == -1 && arg->prec == -1
14072 && !(arg->flags & (F_SIGN | F_BLANK))
14073 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014074 {
14075 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014076 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014077 int base;
14078
Victor Stinnera47082312012-10-04 02:19:54 +020014079 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014080 {
14081 default:
14082 assert(0 && "'type' not in [diuoxX]");
14083 case 'd':
14084 case 'i':
14085 case 'u':
14086 base = 10;
14087 break;
14088 case 'o':
14089 base = 8;
14090 break;
14091 case 'x':
14092 case 'X':
14093 base = 16;
14094 break;
14095 }
14096
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014097 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14098 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014099 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014100 }
14101 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014102 return 1;
14103 }
14104
Ethan Furmanb95b5612015-01-23 20:05:18 -080014105 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014106 Py_DECREF(iobj);
14107 if (res == NULL)
14108 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014109 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014110 return 0;
14111
14112wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014113 switch(type)
14114 {
14115 case 'o':
14116 case 'x':
14117 case 'X':
14118 PyErr_Format(PyExc_TypeError,
14119 "%%%c format: an integer is required, "
14120 "not %.200s",
14121 type, Py_TYPE(v)->tp_name);
14122 break;
14123 default:
14124 PyErr_Format(PyExc_TypeError,
14125 "%%%c format: a number is required, "
14126 "not %.200s",
14127 type, Py_TYPE(v)->tp_name);
14128 break;
14129 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014130 return -1;
14131}
14132
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014133static Py_UCS4
14134formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014135{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014136 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014137 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014138 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014139 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014140 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014141 goto onError;
14142 }
14143 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014144 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014145 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014146 /* make sure number is a type of integer */
14147 if (!PyLong_Check(v)) {
14148 iobj = PyNumber_Index(v);
14149 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014150 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014151 }
14152 v = iobj;
14153 Py_DECREF(iobj);
14154 }
14155 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014156 x = PyLong_AsLong(v);
14157 if (x == -1 && PyErr_Occurred())
14158 goto onError;
14159
Victor Stinner8faf8212011-12-08 22:14:11 +010014160 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014161 PyErr_SetString(PyExc_OverflowError,
14162 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014163 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014164 }
14165
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014166 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014167 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014168
Benjamin Peterson29060642009-01-31 22:14:21 +000014169 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014170 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014171 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014172 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014173}
14174
Victor Stinnera47082312012-10-04 02:19:54 +020014175/* Parse options of an argument: flags, width, precision.
14176 Handle also "%(name)" syntax.
14177
14178 Return 0 if the argument has been formatted into arg->str.
14179 Return 1 if the argument has been written into ctx->writer,
14180 Raise an exception and return -1 on error. */
14181static int
14182unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14183 struct unicode_format_arg_t *arg)
14184{
14185#define FORMAT_READ(ctx) \
14186 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14187
14188 PyObject *v;
14189
Victor Stinnera47082312012-10-04 02:19:54 +020014190 if (arg->ch == '(') {
14191 /* Get argument value from a dictionary. Example: "%(name)s". */
14192 Py_ssize_t keystart;
14193 Py_ssize_t keylen;
14194 PyObject *key;
14195 int pcount = 1;
14196
14197 if (ctx->dict == NULL) {
14198 PyErr_SetString(PyExc_TypeError,
14199 "format requires a mapping");
14200 return -1;
14201 }
14202 ++ctx->fmtpos;
14203 --ctx->fmtcnt;
14204 keystart = ctx->fmtpos;
14205 /* Skip over balanced parentheses */
14206 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14207 arg->ch = FORMAT_READ(ctx);
14208 if (arg->ch == ')')
14209 --pcount;
14210 else if (arg->ch == '(')
14211 ++pcount;
14212 ctx->fmtpos++;
14213 }
14214 keylen = ctx->fmtpos - keystart - 1;
14215 if (ctx->fmtcnt < 0 || pcount > 0) {
14216 PyErr_SetString(PyExc_ValueError,
14217 "incomplete format key");
14218 return -1;
14219 }
14220 key = PyUnicode_Substring(ctx->fmtstr,
14221 keystart, keystart + keylen);
14222 if (key == NULL)
14223 return -1;
14224 if (ctx->args_owned) {
14225 Py_DECREF(ctx->args);
14226 ctx->args_owned = 0;
14227 }
14228 ctx->args = PyObject_GetItem(ctx->dict, key);
14229 Py_DECREF(key);
14230 if (ctx->args == NULL)
14231 return -1;
14232 ctx->args_owned = 1;
14233 ctx->arglen = -1;
14234 ctx->argidx = -2;
14235 }
14236
14237 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014238 while (--ctx->fmtcnt >= 0) {
14239 arg->ch = FORMAT_READ(ctx);
14240 ctx->fmtpos++;
14241 switch (arg->ch) {
14242 case '-': arg->flags |= F_LJUST; continue;
14243 case '+': arg->flags |= F_SIGN; continue;
14244 case ' ': arg->flags |= F_BLANK; continue;
14245 case '#': arg->flags |= F_ALT; continue;
14246 case '0': arg->flags |= F_ZERO; continue;
14247 }
14248 break;
14249 }
14250
14251 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014252 if (arg->ch == '*') {
14253 v = unicode_format_getnextarg(ctx);
14254 if (v == NULL)
14255 return -1;
14256 if (!PyLong_Check(v)) {
14257 PyErr_SetString(PyExc_TypeError,
14258 "* wants int");
14259 return -1;
14260 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014261 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014262 if (arg->width == -1 && PyErr_Occurred())
14263 return -1;
14264 if (arg->width < 0) {
14265 arg->flags |= F_LJUST;
14266 arg->width = -arg->width;
14267 }
14268 if (--ctx->fmtcnt >= 0) {
14269 arg->ch = FORMAT_READ(ctx);
14270 ctx->fmtpos++;
14271 }
14272 }
14273 else if (arg->ch >= '0' && arg->ch <= '9') {
14274 arg->width = arg->ch - '0';
14275 while (--ctx->fmtcnt >= 0) {
14276 arg->ch = FORMAT_READ(ctx);
14277 ctx->fmtpos++;
14278 if (arg->ch < '0' || arg->ch > '9')
14279 break;
14280 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14281 mixing signed and unsigned comparison. Since arg->ch is between
14282 '0' and '9', casting to int is safe. */
14283 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14284 PyErr_SetString(PyExc_ValueError,
14285 "width too big");
14286 return -1;
14287 }
14288 arg->width = arg->width*10 + (arg->ch - '0');
14289 }
14290 }
14291
14292 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014293 if (arg->ch == '.') {
14294 arg->prec = 0;
14295 if (--ctx->fmtcnt >= 0) {
14296 arg->ch = FORMAT_READ(ctx);
14297 ctx->fmtpos++;
14298 }
14299 if (arg->ch == '*') {
14300 v = unicode_format_getnextarg(ctx);
14301 if (v == NULL)
14302 return -1;
14303 if (!PyLong_Check(v)) {
14304 PyErr_SetString(PyExc_TypeError,
14305 "* wants int");
14306 return -1;
14307 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014308 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014309 if (arg->prec == -1 && PyErr_Occurred())
14310 return -1;
14311 if (arg->prec < 0)
14312 arg->prec = 0;
14313 if (--ctx->fmtcnt >= 0) {
14314 arg->ch = FORMAT_READ(ctx);
14315 ctx->fmtpos++;
14316 }
14317 }
14318 else if (arg->ch >= '0' && arg->ch <= '9') {
14319 arg->prec = arg->ch - '0';
14320 while (--ctx->fmtcnt >= 0) {
14321 arg->ch = FORMAT_READ(ctx);
14322 ctx->fmtpos++;
14323 if (arg->ch < '0' || arg->ch > '9')
14324 break;
14325 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14326 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014327 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014328 return -1;
14329 }
14330 arg->prec = arg->prec*10 + (arg->ch - '0');
14331 }
14332 }
14333 }
14334
14335 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14336 if (ctx->fmtcnt >= 0) {
14337 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14338 if (--ctx->fmtcnt >= 0) {
14339 arg->ch = FORMAT_READ(ctx);
14340 ctx->fmtpos++;
14341 }
14342 }
14343 }
14344 if (ctx->fmtcnt < 0) {
14345 PyErr_SetString(PyExc_ValueError,
14346 "incomplete format");
14347 return -1;
14348 }
14349 return 0;
14350
14351#undef FORMAT_READ
14352}
14353
14354/* Format one argument. Supported conversion specifiers:
14355
14356 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014357 - "i", "d", "u": int or float
14358 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014359 - "e", "E", "f", "F", "g", "G": float
14360 - "c": int or str (1 character)
14361
Victor Stinner8dbd4212012-12-04 09:30:24 +010014362 When possible, the output is written directly into the Unicode writer
14363 (ctx->writer). A string is created when padding is required.
14364
Victor Stinnera47082312012-10-04 02:19:54 +020014365 Return 0 if the argument has been formatted into *p_str,
14366 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014367 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014368static int
14369unicode_format_arg_format(struct unicode_formatter_t *ctx,
14370 struct unicode_format_arg_t *arg,
14371 PyObject **p_str)
14372{
14373 PyObject *v;
14374 _PyUnicodeWriter *writer = &ctx->writer;
14375
14376 if (ctx->fmtcnt == 0)
14377 ctx->writer.overallocate = 0;
14378
14379 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014380 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014381 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014382 return 1;
14383 }
14384
14385 v = unicode_format_getnextarg(ctx);
14386 if (v == NULL)
14387 return -1;
14388
Victor Stinnera47082312012-10-04 02:19:54 +020014389
14390 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014391 case 's':
14392 case 'r':
14393 case 'a':
14394 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14395 /* Fast path */
14396 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14397 return -1;
14398 return 1;
14399 }
14400
14401 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14402 *p_str = v;
14403 Py_INCREF(*p_str);
14404 }
14405 else {
14406 if (arg->ch == 's')
14407 *p_str = PyObject_Str(v);
14408 else if (arg->ch == 'r')
14409 *p_str = PyObject_Repr(v);
14410 else
14411 *p_str = PyObject_ASCII(v);
14412 }
14413 break;
14414
14415 case 'i':
14416 case 'd':
14417 case 'u':
14418 case 'o':
14419 case 'x':
14420 case 'X':
14421 {
14422 int ret = mainformatlong(v, arg, p_str, writer);
14423 if (ret != 0)
14424 return ret;
14425 arg->sign = 1;
14426 break;
14427 }
14428
14429 case 'e':
14430 case 'E':
14431 case 'f':
14432 case 'F':
14433 case 'g':
14434 case 'G':
14435 if (arg->width == -1 && arg->prec == -1
14436 && !(arg->flags & (F_SIGN | F_BLANK)))
14437 {
14438 /* Fast path */
14439 if (formatfloat(v, arg, NULL, writer) == -1)
14440 return -1;
14441 return 1;
14442 }
14443
14444 arg->sign = 1;
14445 if (formatfloat(v, arg, p_str, NULL) == -1)
14446 return -1;
14447 break;
14448
14449 case 'c':
14450 {
14451 Py_UCS4 ch = formatchar(v);
14452 if (ch == (Py_UCS4) -1)
14453 return -1;
14454 if (arg->width == -1 && arg->prec == -1) {
14455 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014456 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014457 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014458 return 1;
14459 }
14460 *p_str = PyUnicode_FromOrdinal(ch);
14461 break;
14462 }
14463
14464 default:
14465 PyErr_Format(PyExc_ValueError,
14466 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014467 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014468 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14469 (int)arg->ch,
14470 ctx->fmtpos - 1);
14471 return -1;
14472 }
14473 if (*p_str == NULL)
14474 return -1;
14475 assert (PyUnicode_Check(*p_str));
14476 return 0;
14477}
14478
14479static int
14480unicode_format_arg_output(struct unicode_formatter_t *ctx,
14481 struct unicode_format_arg_t *arg,
14482 PyObject *str)
14483{
14484 Py_ssize_t len;
14485 enum PyUnicode_Kind kind;
14486 void *pbuf;
14487 Py_ssize_t pindex;
14488 Py_UCS4 signchar;
14489 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014490 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014491 Py_ssize_t sublen;
14492 _PyUnicodeWriter *writer = &ctx->writer;
14493 Py_UCS4 fill;
14494
14495 fill = ' ';
14496 if (arg->sign && arg->flags & F_ZERO)
14497 fill = '0';
14498
14499 if (PyUnicode_READY(str) == -1)
14500 return -1;
14501
14502 len = PyUnicode_GET_LENGTH(str);
14503 if ((arg->width == -1 || arg->width <= len)
14504 && (arg->prec == -1 || arg->prec >= len)
14505 && !(arg->flags & (F_SIGN | F_BLANK)))
14506 {
14507 /* Fast path */
14508 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14509 return -1;
14510 return 0;
14511 }
14512
14513 /* Truncate the string for "s", "r" and "a" formats
14514 if the precision is set */
14515 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14516 if (arg->prec >= 0 && len > arg->prec)
14517 len = arg->prec;
14518 }
14519
14520 /* Adjust sign and width */
14521 kind = PyUnicode_KIND(str);
14522 pbuf = PyUnicode_DATA(str);
14523 pindex = 0;
14524 signchar = '\0';
14525 if (arg->sign) {
14526 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14527 if (ch == '-' || ch == '+') {
14528 signchar = ch;
14529 len--;
14530 pindex++;
14531 }
14532 else if (arg->flags & F_SIGN)
14533 signchar = '+';
14534 else if (arg->flags & F_BLANK)
14535 signchar = ' ';
14536 else
14537 arg->sign = 0;
14538 }
14539 if (arg->width < len)
14540 arg->width = len;
14541
14542 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014543 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014544 if (!(arg->flags & F_LJUST)) {
14545 if (arg->sign) {
14546 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014547 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014548 }
14549 else {
14550 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014551 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014552 }
14553 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014554 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14555 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014556 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014557 }
14558
Victor Stinnera47082312012-10-04 02:19:54 +020014559 buflen = arg->width;
14560 if (arg->sign && len == arg->width)
14561 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014562 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014563 return -1;
14564
14565 /* Write the sign if needed */
14566 if (arg->sign) {
14567 if (fill != ' ') {
14568 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14569 writer->pos += 1;
14570 }
14571 if (arg->width > len)
14572 arg->width--;
14573 }
14574
14575 /* Write the numeric prefix for "x", "X" and "o" formats
14576 if the alternate form is used.
14577 For example, write "0x" for the "%#x" format. */
14578 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14579 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14580 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14581 if (fill != ' ') {
14582 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14583 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14584 writer->pos += 2;
14585 pindex += 2;
14586 }
14587 arg->width -= 2;
14588 if (arg->width < 0)
14589 arg->width = 0;
14590 len -= 2;
14591 }
14592
14593 /* Pad left with the fill character if needed */
14594 if (arg->width > len && !(arg->flags & F_LJUST)) {
14595 sublen = arg->width - len;
14596 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14597 writer->pos += sublen;
14598 arg->width = len;
14599 }
14600
14601 /* If padding with spaces: write sign if needed and/or numeric prefix if
14602 the alternate form is used */
14603 if (fill == ' ') {
14604 if (arg->sign) {
14605 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14606 writer->pos += 1;
14607 }
14608 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14609 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14610 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14611 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14612 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14613 writer->pos += 2;
14614 pindex += 2;
14615 }
14616 }
14617
14618 /* Write characters */
14619 if (len) {
14620 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14621 str, pindex, len);
14622 writer->pos += len;
14623 }
14624
14625 /* Pad right with the fill character if needed */
14626 if (arg->width > len) {
14627 sublen = arg->width - len;
14628 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14629 writer->pos += sublen;
14630 }
14631 return 0;
14632}
14633
14634/* Helper of PyUnicode_Format(): format one arg.
14635 Return 0 on success, raise an exception and return -1 on error. */
14636static int
14637unicode_format_arg(struct unicode_formatter_t *ctx)
14638{
14639 struct unicode_format_arg_t arg;
14640 PyObject *str;
14641 int ret;
14642
Victor Stinner8dbd4212012-12-04 09:30:24 +010014643 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14644 arg.flags = 0;
14645 arg.width = -1;
14646 arg.prec = -1;
14647 arg.sign = 0;
14648 str = NULL;
14649
Victor Stinnera47082312012-10-04 02:19:54 +020014650 ret = unicode_format_arg_parse(ctx, &arg);
14651 if (ret == -1)
14652 return -1;
14653
14654 ret = unicode_format_arg_format(ctx, &arg, &str);
14655 if (ret == -1)
14656 return -1;
14657
14658 if (ret != 1) {
14659 ret = unicode_format_arg_output(ctx, &arg, str);
14660 Py_DECREF(str);
14661 if (ret == -1)
14662 return -1;
14663 }
14664
14665 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14666 PyErr_SetString(PyExc_TypeError,
14667 "not all arguments converted during string formatting");
14668 return -1;
14669 }
14670 return 0;
14671}
14672
Alexander Belopolsky40018472011-02-26 01:02:56 +000014673PyObject *
14674PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014675{
Victor Stinnera47082312012-10-04 02:19:54 +020014676 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014677
Guido van Rossumd57fd912000-03-10 22:53:23 +000014678 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014679 PyErr_BadInternalCall();
14680 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014681 }
Victor Stinnera47082312012-10-04 02:19:54 +020014682
14683 ctx.fmtstr = PyUnicode_FromObject(format);
14684 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014685 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014686 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14687 Py_DECREF(ctx.fmtstr);
14688 return NULL;
14689 }
14690 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14691 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14692 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14693 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014694
Victor Stinner8f674cc2013-04-17 23:02:17 +020014695 _PyUnicodeWriter_Init(&ctx.writer);
14696 ctx.writer.min_length = ctx.fmtcnt + 100;
14697 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014698
Guido van Rossumd57fd912000-03-10 22:53:23 +000014699 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014700 ctx.arglen = PyTuple_Size(args);
14701 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014702 }
14703 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014704 ctx.arglen = -1;
14705 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014706 }
Victor Stinnera47082312012-10-04 02:19:54 +020014707 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014708 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014709 ctx.dict = args;
14710 else
14711 ctx.dict = NULL;
14712 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014713
Victor Stinnera47082312012-10-04 02:19:54 +020014714 while (--ctx.fmtcnt >= 0) {
14715 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014716 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014717
14718 nonfmtpos = ctx.fmtpos++;
14719 while (ctx.fmtcnt >= 0 &&
14720 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14721 ctx.fmtpos++;
14722 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014723 }
Victor Stinnera47082312012-10-04 02:19:54 +020014724 if (ctx.fmtcnt < 0) {
14725 ctx.fmtpos--;
14726 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014727 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014728
Victor Stinnercfc4c132013-04-03 01:48:39 +020014729 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14730 nonfmtpos, ctx.fmtpos) < 0)
14731 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014732 }
14733 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014734 ctx.fmtpos++;
14735 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014736 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014737 }
14738 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014739
Victor Stinnera47082312012-10-04 02:19:54 +020014740 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014741 PyErr_SetString(PyExc_TypeError,
14742 "not all arguments converted during string formatting");
14743 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014744 }
14745
Victor Stinnera47082312012-10-04 02:19:54 +020014746 if (ctx.args_owned) {
14747 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014748 }
Victor Stinnera47082312012-10-04 02:19:54 +020014749 Py_DECREF(ctx.fmtstr);
14750 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014751
Benjamin Peterson29060642009-01-31 22:14:21 +000014752 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014753 Py_DECREF(ctx.fmtstr);
14754 _PyUnicodeWriter_Dealloc(&ctx.writer);
14755 if (ctx.args_owned) {
14756 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014757 }
14758 return NULL;
14759}
14760
Jeremy Hylton938ace62002-07-17 16:30:39 +000014761static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014762unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14763
Tim Peters6d6c1a32001-08-02 04:15:00 +000014764static PyObject *
14765unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14766{
Benjamin Peterson29060642009-01-31 22:14:21 +000014767 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014768 static char *kwlist[] = {"object", "encoding", "errors", 0};
14769 char *encoding = NULL;
14770 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014771
Benjamin Peterson14339b62009-01-31 16:36:08 +000014772 if (type != &PyUnicode_Type)
14773 return unicode_subtype_new(type, args, kwds);
14774 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014775 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014776 return NULL;
14777 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014778 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014779 if (encoding == NULL && errors == NULL)
14780 return PyObject_Str(x);
14781 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014782 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014783}
14784
Guido van Rossume023fe02001-08-30 03:12:59 +000014785static PyObject *
14786unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14787{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014788 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014789 Py_ssize_t length, char_size;
14790 int share_wstr, share_utf8;
14791 unsigned int kind;
14792 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014793
Benjamin Peterson14339b62009-01-31 16:36:08 +000014794 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014795
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014796 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014797 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014798 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014799 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014800 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014801 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014802 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014803 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014804
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014805 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014806 if (self == NULL) {
14807 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014808 return NULL;
14809 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014810 kind = PyUnicode_KIND(unicode);
14811 length = PyUnicode_GET_LENGTH(unicode);
14812
14813 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014814#ifdef Py_DEBUG
14815 _PyUnicode_HASH(self) = -1;
14816#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014817 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014818#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014819 _PyUnicode_STATE(self).interned = 0;
14820 _PyUnicode_STATE(self).kind = kind;
14821 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014822 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014823 _PyUnicode_STATE(self).ready = 1;
14824 _PyUnicode_WSTR(self) = NULL;
14825 _PyUnicode_UTF8_LENGTH(self) = 0;
14826 _PyUnicode_UTF8(self) = NULL;
14827 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014828 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014829
14830 share_utf8 = 0;
14831 share_wstr = 0;
14832 if (kind == PyUnicode_1BYTE_KIND) {
14833 char_size = 1;
14834 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14835 share_utf8 = 1;
14836 }
14837 else if (kind == PyUnicode_2BYTE_KIND) {
14838 char_size = 2;
14839 if (sizeof(wchar_t) == 2)
14840 share_wstr = 1;
14841 }
14842 else {
14843 assert(kind == PyUnicode_4BYTE_KIND);
14844 char_size = 4;
14845 if (sizeof(wchar_t) == 4)
14846 share_wstr = 1;
14847 }
14848
14849 /* Ensure we won't overflow the length. */
14850 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14851 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014852 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014853 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014854 data = PyObject_MALLOC((length + 1) * char_size);
14855 if (data == NULL) {
14856 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014857 goto onError;
14858 }
14859
Victor Stinnerc3c74152011-10-02 20:39:55 +020014860 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014861 if (share_utf8) {
14862 _PyUnicode_UTF8_LENGTH(self) = length;
14863 _PyUnicode_UTF8(self) = data;
14864 }
14865 if (share_wstr) {
14866 _PyUnicode_WSTR_LENGTH(self) = length;
14867 _PyUnicode_WSTR(self) = (wchar_t *)data;
14868 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014869
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014870 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014871 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014872 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014873#ifdef Py_DEBUG
14874 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14875#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014876 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014877 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014878
14879onError:
14880 Py_DECREF(unicode);
14881 Py_DECREF(self);
14882 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014883}
14884
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014885PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014886"str(object='') -> str\n\
14887str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014888\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014889Create a new string object from the given object. If encoding or\n\
14890errors is specified, then the object must expose a data buffer\n\
14891that will be decoded using the given encoding and error handler.\n\
14892Otherwise, returns the result of object.__str__() (if defined)\n\
14893or repr(object).\n\
14894encoding defaults to sys.getdefaultencoding().\n\
14895errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014896
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014897static PyObject *unicode_iter(PyObject *seq);
14898
Guido van Rossumd57fd912000-03-10 22:53:23 +000014899PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014900 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014901 "str", /* tp_name */
14902 sizeof(PyUnicodeObject), /* tp_size */
14903 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014904 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014905 (destructor)unicode_dealloc, /* tp_dealloc */
14906 0, /* tp_print */
14907 0, /* tp_getattr */
14908 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014909 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014910 unicode_repr, /* tp_repr */
14911 &unicode_as_number, /* tp_as_number */
14912 &unicode_as_sequence, /* tp_as_sequence */
14913 &unicode_as_mapping, /* tp_as_mapping */
14914 (hashfunc) unicode_hash, /* tp_hash*/
14915 0, /* tp_call*/
14916 (reprfunc) unicode_str, /* tp_str */
14917 PyObject_GenericGetAttr, /* tp_getattro */
14918 0, /* tp_setattro */
14919 0, /* tp_as_buffer */
14920 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014921 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014922 unicode_doc, /* tp_doc */
14923 0, /* tp_traverse */
14924 0, /* tp_clear */
14925 PyUnicode_RichCompare, /* tp_richcompare */
14926 0, /* tp_weaklistoffset */
14927 unicode_iter, /* tp_iter */
14928 0, /* tp_iternext */
14929 unicode_methods, /* tp_methods */
14930 0, /* tp_members */
14931 0, /* tp_getset */
14932 &PyBaseObject_Type, /* tp_base */
14933 0, /* tp_dict */
14934 0, /* tp_descr_get */
14935 0, /* tp_descr_set */
14936 0, /* tp_dictoffset */
14937 0, /* tp_init */
14938 0, /* tp_alloc */
14939 unicode_new, /* tp_new */
14940 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014941};
14942
14943/* Initialize the Unicode implementation */
14944
Victor Stinner3a50e702011-10-18 21:21:00 +020014945int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014946{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014947 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014948 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014949 0x000A, /* LINE FEED */
14950 0x000D, /* CARRIAGE RETURN */
14951 0x001C, /* FILE SEPARATOR */
14952 0x001D, /* GROUP SEPARATOR */
14953 0x001E, /* RECORD SEPARATOR */
14954 0x0085, /* NEXT LINE */
14955 0x2028, /* LINE SEPARATOR */
14956 0x2029, /* PARAGRAPH SEPARATOR */
14957 };
14958
Fred Drakee4315f52000-05-09 19:53:39 +000014959 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014960 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014961 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014962 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014963 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014964
Guido van Rossumcacfc072002-05-24 19:01:59 +000014965 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014966 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014967
14968 /* initialize the linebreak bloom filter */
14969 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014970 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014971 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014972
Christian Heimes26532f72013-07-20 14:57:16 +020014973 if (PyType_Ready(&EncodingMapType) < 0)
14974 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014975
Benjamin Petersonc4311282012-10-30 23:21:10 -040014976 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14977 Py_FatalError("Can't initialize field name iterator type");
14978
14979 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14980 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014981
Victor Stinner3a50e702011-10-18 21:21:00 +020014982#ifdef HAVE_MBCS
14983 winver.dwOSVersionInfoSize = sizeof(winver);
14984 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14985 PyErr_SetFromWindowsErr(0);
14986 return -1;
14987 }
14988#endif
14989 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014990}
14991
14992/* Finalize the Unicode implementation */
14993
/* Nothing is released here: the function always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14999
Guido van Rossumd57fd912000-03-10 22:53:23 +000015000void
Thomas Wouters78890102000-07-22 19:25:51 +000015001_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015002{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015003 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015004
Serhiy Storchaka05997252013-01-26 12:14:02 +020015005 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015006
Serhiy Storchaka05997252013-01-26 12:14:02 +020015007 for (i = 0; i < 256; i++)
15008 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015009 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015010 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015011}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015012
Walter Dörwald16807132007-05-25 13:52:07 +000015013void
15014PyUnicode_InternInPlace(PyObject **p)
15015{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015016 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015017 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015018#ifdef Py_DEBUG
15019 assert(s != NULL);
15020 assert(_PyUnicode_CHECK(s));
15021#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015022 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015023 return;
15024#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015025 /* If it's a subclass, we don't really know what putting
15026 it in the interned dict might do. */
15027 if (!PyUnicode_CheckExact(s))
15028 return;
15029 if (PyUnicode_CHECK_INTERNED(s))
15030 return;
15031 if (interned == NULL) {
15032 interned = PyDict_New();
15033 if (interned == NULL) {
15034 PyErr_Clear(); /* Don't leave an exception */
15035 return;
15036 }
15037 }
15038 /* It might be that the GetItem call fails even
15039 though the key is present in the dictionary,
15040 namely when this happens during a stack overflow. */
15041 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015042 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015043 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015044
Victor Stinnerf0335102013-04-14 19:13:03 +020015045 if (t) {
15046 Py_INCREF(t);
15047 Py_DECREF(*p);
15048 *p = t;
15049 return;
15050 }
Walter Dörwald16807132007-05-25 13:52:07 +000015051
Benjamin Peterson14339b62009-01-31 16:36:08 +000015052 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015053 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015054 PyErr_Clear();
15055 PyThreadState_GET()->recursion_critical = 0;
15056 return;
15057 }
15058 PyThreadState_GET()->recursion_critical = 0;
15059 /* The two references in interned are not counted by refcnt.
15060 The deallocator will take care of this */
15061 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015062 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015063}
15064
15065void
15066PyUnicode_InternImmortal(PyObject **p)
15067{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015068 PyUnicode_InternInPlace(p);
15069 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015070 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015071 Py_INCREF(*p);
15072 }
Walter Dörwald16807132007-05-25 13:52:07 +000015073}
15074
15075PyObject *
15076PyUnicode_InternFromString(const char *cp)
15077{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015078 PyObject *s = PyUnicode_FromString(cp);
15079 if (s == NULL)
15080 return NULL;
15081 PyUnicode_InternInPlace(&s);
15082 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015083}
15084
Alexander Belopolsky40018472011-02-26 01:02:56 +000015085void
15086_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015087{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015088 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015089 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015090 Py_ssize_t i, n;
15091 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015092
Benjamin Peterson14339b62009-01-31 16:36:08 +000015093 if (interned == NULL || !PyDict_Check(interned))
15094 return;
15095 keys = PyDict_Keys(interned);
15096 if (keys == NULL || !PyList_Check(keys)) {
15097 PyErr_Clear();
15098 return;
15099 }
Walter Dörwald16807132007-05-25 13:52:07 +000015100
Benjamin Peterson14339b62009-01-31 16:36:08 +000015101 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15102 detector, interned unicode strings are not forcibly deallocated;
15103 rather, we give them their stolen references back, and then clear
15104 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015105
Benjamin Peterson14339b62009-01-31 16:36:08 +000015106 n = PyList_GET_SIZE(keys);
15107 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015108 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015109 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015110 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015111 if (PyUnicode_READY(s) == -1) {
15112 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015113 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015114 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015115 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015116 case SSTATE_NOT_INTERNED:
15117 /* XXX Shouldn't happen */
15118 break;
15119 case SSTATE_INTERNED_IMMORTAL:
15120 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015121 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015122 break;
15123 case SSTATE_INTERNED_MORTAL:
15124 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015125 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015126 break;
15127 default:
15128 Py_FatalError("Inconsistent interned string state.");
15129 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015130 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015131 }
15132 fprintf(stderr, "total size of all interned strings: "
15133 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15134 "mortal/immortal\n", mortal_size, immortal_size);
15135 Py_DECREF(keys);
15136 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015137 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015138}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015139
15140
15141/********************* Unicode Iterator **************************/
15142
15143typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015144 PyObject_HEAD
15145 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015146 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015147} unicodeiterobject;
15148
15149static void
15150unicodeiter_dealloc(unicodeiterobject *it)
15151{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015152 _PyObject_GC_UNTRACK(it);
15153 Py_XDECREF(it->it_seq);
15154 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015155}
15156
15157static int
15158unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15159{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015160 Py_VISIT(it->it_seq);
15161 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015162}
15163
15164static PyObject *
15165unicodeiter_next(unicodeiterobject *it)
15166{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015167 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015168
Benjamin Peterson14339b62009-01-31 16:36:08 +000015169 assert(it != NULL);
15170 seq = it->it_seq;
15171 if (seq == NULL)
15172 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015173 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015175 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15176 int kind = PyUnicode_KIND(seq);
15177 void *data = PyUnicode_DATA(seq);
15178 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15179 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015180 if (item != NULL)
15181 ++it->it_index;
15182 return item;
15183 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015184
Benjamin Peterson14339b62009-01-31 16:36:08 +000015185 Py_DECREF(seq);
15186 it->it_seq = NULL;
15187 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015188}
15189
15190static PyObject *
15191unicodeiter_len(unicodeiterobject *it)
15192{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015193 Py_ssize_t len = 0;
15194 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015195 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015196 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015197}
15198
15199PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15200
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015201static PyObject *
15202unicodeiter_reduce(unicodeiterobject *it)
15203{
15204 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015205 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015206 it->it_seq, it->it_index);
15207 } else {
15208 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15209 if (u == NULL)
15210 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015211 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015212 }
15213}
15214
15215PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15216
15217static PyObject *
15218unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15219{
15220 Py_ssize_t index = PyLong_AsSsize_t(state);
15221 if (index == -1 && PyErr_Occurred())
15222 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015223 if (it->it_seq != NULL) {
15224 if (index < 0)
15225 index = 0;
15226 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15227 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15228 it->it_index = index;
15229 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015230 Py_RETURN_NONE;
15231}
15232
15233PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15234
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015235static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015236 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015237 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015238 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15239 reduce_doc},
15240 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15241 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015242 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015243};
15244
15245PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015246 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15247 "str_iterator", /* tp_name */
15248 sizeof(unicodeiterobject), /* tp_basicsize */
15249 0, /* tp_itemsize */
15250 /* methods */
15251 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15252 0, /* tp_print */
15253 0, /* tp_getattr */
15254 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015255 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015256 0, /* tp_repr */
15257 0, /* tp_as_number */
15258 0, /* tp_as_sequence */
15259 0, /* tp_as_mapping */
15260 0, /* tp_hash */
15261 0, /* tp_call */
15262 0, /* tp_str */
15263 PyObject_GenericGetAttr, /* tp_getattro */
15264 0, /* tp_setattro */
15265 0, /* tp_as_buffer */
15266 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15267 0, /* tp_doc */
15268 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15269 0, /* tp_clear */
15270 0, /* tp_richcompare */
15271 0, /* tp_weaklistoffset */
15272 PyObject_SelfIter, /* tp_iter */
15273 (iternextfunc)unicodeiter_next, /* tp_iternext */
15274 unicodeiter_methods, /* tp_methods */
15275 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015276};
15277
15278static PyObject *
15279unicode_iter(PyObject *seq)
15280{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015281 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015282
Benjamin Peterson14339b62009-01-31 16:36:08 +000015283 if (!PyUnicode_Check(seq)) {
15284 PyErr_BadInternalCall();
15285 return NULL;
15286 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015287 if (PyUnicode_READY(seq) == -1)
15288 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015289 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15290 if (it == NULL)
15291 return NULL;
15292 it->it_index = 0;
15293 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015294 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015295 _PyObject_GC_TRACK(it);
15296 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015297}
15298
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015299
/* Return the number of Py_UNICODE elements in the null-terminated
   string `u`, not counting the terminator.

   The counter has the same type as the return value (size_t); counting
   in a plain int would overflow on strings longer than INT_MAX. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
15308
/* Copy the null-terminated string `s2`, terminator included, into the
   buffer `s1` and return `s1`.  The caller must provide a destination
   that is large enough. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0)
            break;              /* terminator has been copied */
    }
    return s1;
}
15316
/* Copy at most `n` elements of the null-terminated string `s2` into
   `s1` and return `s1`.

   Copying stops after the terminator has been copied or after `n`
   elements, whichever comes first.  As with strncpy(), the result is
   not null-terminated when `s2` holds `n` or more characters; unlike
   strncpy(), the rest of the destination is not zero-filled.

   The remaining count is checked *before* each store, so never more
   than `n` elements are written (and none at all for n == 0). */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;
    while (n != 0) {
        n--;
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
15326
/* Append the null-terminated string `s2` to the end of the
   null-terminated string `s1` and return `s1`.  The caller must make
   sure `s1` has room for the combined string. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Locate the terminator of s1 and copy s2 over it. */
    Py_UNICODE *tail = s1 + Py_UNICODE_strlen(s1);

    Py_UNICODE_strcpy(tail, s2);
    return s1;
}
15335
/* Compare two null-terminated Py_UNICODE strings element by element.

   Returns -1, 0 or +1 when `s1` sorts before, equal to, or after `s2`.
   A string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Skip the common prefix. */
    while (*s1 != 0 && *s1 == *s2) {
        s1++;
        s2++;
    }

    if (*s1 == 0) {
        /* s1 ended: equal if s2 ended too, otherwise s1 is a prefix. */
        return (*s2 == 0) ? 0 : -1;
    }
    if (*s2 == 0)
        return 1;               /* s2 is a proper prefix of s1 */
    return (*s1 < *s2) ? -1 : +1;
}
15349
/* Compare at most the first `n` elements of two null-terminated
   Py_UNICODE strings.

   Returns -1 or +1 at the first differing element, and 0 when the
   strings agree up to `n` elements or up to a common terminator. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE a = s1[i];
        const Py_UNICODE b = s2[i];

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            return 0;           /* both strings ended together */
    }
    return 0;
}
15366
/* Return a pointer to the first occurrence of `c` in the
   null-terminated string `s`, or NULL if it does not occur.  The
   terminator itself is never matched, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
15376
/* Return a pointer to the last occurrence of `c` in the
   null-terminated string `s`, or NULL if it does not occur.  The
   terminator itself is never matched. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t i;

    /* Walk backwards from the last character to the first. */
    for (i = Py_UNICODE_strlen(s); i > 0; i--) {
        if (s[i - 1] == c)
            return (Py_UNICODE*)(s + (i - 1));
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015389
Victor Stinner71133ff2010-09-01 23:43:53 +000015390Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015391PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015392{
Victor Stinner577db2c2011-10-11 22:12:48 +020015393 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015394 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015396 if (!PyUnicode_Check(unicode)) {
15397 PyErr_BadArgument();
15398 return NULL;
15399 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015400 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015401 if (u == NULL)
15402 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015403 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015404 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015405 PyErr_NoMemory();
15406 return NULL;
15407 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015408 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015409 size *= sizeof(Py_UNICODE);
15410 copy = PyMem_Malloc(size);
15411 if (copy == NULL) {
15412 PyErr_NoMemory();
15413 return NULL;
15414 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015415 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015416 return copy;
15417}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015418
Georg Brandl66c221e2010-10-14 07:04:07 +000015419/* A _string module, to export formatter_parser and formatter_field_name_split
15420 to the string.Formatter class implemented in Python. */
15421
15422static PyMethodDef _string_methods[] = {
15423 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15424 METH_O, PyDoc_STR("split the argument as a field name")},
15425 {"formatter_parser", (PyCFunction) formatter_parser,
15426 METH_O, PyDoc_STR("parse the argument as a format string")},
15427 {NULL, NULL}
15428};
15429
15430static struct PyModuleDef _string_module = {
15431 PyModuleDef_HEAD_INIT,
15432 "_string",
15433 PyDoc_STR("string helper module"),
15434 0,
15435 _string_methods,
15436 NULL,
15437 NULL,
15438 NULL,
15439 NULL
15440};
15441
15442PyMODINIT_FUNC
15443PyInit__string(void)
15444{
15445 return PyModule_Create(&_string_module);
15446}
15447
15448
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015449#ifdef __cplusplus
15450}
15451#endif