blob: 03f795cf06a12a32ef3063ecbc11e8619d6b72cd [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
/* Largest valid code point, U+10FFFF (1,114,111), as of Unicode 6.0. */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the accessor macros in this file.  Debug builds run
   _PyUnicode_CheckConsistency() with check_content=0 (structure only, the
   characters are not scanned); other builds only do PyUnicode_Check(). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
/* --- Raw field accessors -------------------------------------------------
   These expand to a plain struct member access after casting op to the
   struct that declares the field.  They perform no checking at all. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors ---------------------------------------------------
   Same fields, but guarded by assert(_PyUnicode_CHECK(op)). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 buffer of a ready string.  For a compact ASCII string this is the
   memory located directly after the PyASCIIObject header; otherwise it is
   the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Length of the UTF-8 buffer of a ready string.  For a compact ASCII string
   this is the length field; otherwise it is the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
/* File-local override of PyUnicode_READY(): evaluates to 0 when the string
   is already ready, otherwise to the result of _PyUnicode_Ready(op).  The
   argument is validated with _PyUnicode_CHECK() first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_READY(op) ?                         \
          _PyUnicode_Ready(op) :                        \
          0))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True if the utf8 pointer of op is the character data buffer itself
   (no separate UTF-8 memory block).  Must not be used on compact ASCII
   strings. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the character data buffer itself
   (no separate wchar_t memory block).  Both sides of the comparison are
   taken from the macro argument, so the macro works regardless of how the
   caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object owns a separately allocated UTF-8 memory
   block: it is not compact ASCII, its utf8 pointer is set, and that pointer
   is not the character data buffer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr memory
   block: its wstr pointer is set and either the string is not ready yet or
   the pointer is not the character data buffer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op)                               \
      && (!PyUnicode_IS_READY(op)                       \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Generic element-wise conversion between character buffers of different
   widths.

   from_type, to_type -- valid type names of the source and target elements.
   begin, end         -- delimit the source characters ("from_type *",
                         end is exclusive).
   to                 -- "to_type *" pointing at the output buffer, which
                         must have room for (end - begin) elements.

   The bulk of the input is copied four elements per iteration; the
   remaining 0-3 elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (from_type *)(begin);                  \
        const from_type *_end = (from_type *)(end);                     \
        Py_ssize_t n = (_end) - (_iter);                                \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                          \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) {         \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < (_end); _iter++, _to++)                          \
            *_to = (to_type) *_iter;                                    \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
170 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179 do { \
180 if (unicode_empty != NULL) \
181 Py_INCREF(unicode_empty); \
182 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200183 unicode_empty = PyUnicode_New(0, 0); \
184 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200185 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
187 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200189 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
Serhiy Storchaka678db842013-01-26 12:16:36 +0200191#define _Py_RETURN_UNICODE_EMPTY() \
192 do { \
193 _Py_INCREF_UNICODE_EMPTY(); \
194 return unicode_empty; \
195 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by a 7-bit code point; an entry is 1 for:
     0x09 CHARACTER TABULATION    0x1C FILE SEPARATOR
     0x0A LINE FEED               0x1D GROUP SEPARATOR
     0x0B LINE TABULATION         0x1E RECORD SEPARATOR
     0x0C FORM FEED               0x1F UNIT SEPARATOR
     0x0D CARRIAGE RETURN         0x20 SPACE
   and 0 for everything else.  Each row below covers 16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Fast detection of ASCII line break characters.

   Read-only lookup table indexed by a 7-bit code point; an entry is 1 for:
     0x0A LINE FEED               0x1C FILE SEPARATOR
     0x0B LINE TABULATION         0x1D GROUP SEPARATOR
     0x0C FORM FEED               0x1E RECORD SEPARATOR
     0x0D CARRIAGE RETURN
   and 0 for everything else.  Each row below covers 16 code points.

   The table is declared const because it is only ever read (see
   BLOOM_LINEBREAK). */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
292
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300293/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
294 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000295Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000296PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000298#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000299 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000300#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 /* This is actually an illegal character, so it should
302 not be passed to unichr. */
303 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000304#endif
305}
306
#ifdef Py_DEBUG
/* Debug-build validator for the internal state of a Unicode object.

   op            -- object to inspect; must satisfy PyUnicode_Check().
   check_content -- if non-zero and the string is not of kind
                    PyUnicode_WCHAR_KIND, the characters are scanned as well
                    to verify that the narrowest possible kind is used and
                    that the buffer is null-terminated.

   Every violated invariant trips an assert().  The function returns 1 when
   all checks pass, so a call can itself be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;
    Py_ssize_t pos;
    Py_UCS4 highest;
    void *chars;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        int wstr_shares_data;

        /* wstr is expected to alias the character data exactly when the
           kind has the same width as wchar_t */
#if SIZEOF_WCHAR_T == 2
        wstr_shares_data = (kind == PyUnicode_2BYTE_KIND);
#else
        wstr_shares_data = (kind == PyUnicode_4BYTE_KIND);
#endif

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: data follows the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert(compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* string that is not ready yet: only wstr is set */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* ready, non-compact string */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert(compact->utf8 == data);
                assert(compact->utf8_length == ascii->length);
            }
            else {
                assert(compact->utf8 != data);
            }
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            if (wstr_shares_data) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        /* an unset cache pointer must come with a zero length */
        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }

    if (!check_content || kind == PyUnicode_WCHAR_KIND)
        return 1;

    /* check that the best kind is used */
    chars = PyUnicode_DATA(ascii);
    highest = 0;
    for (pos = 0; pos < ascii->length; pos++) {
        Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
        if (ch > highest)
            highest = ch;
    }

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        if (ascii->state.ascii == 0) {
            assert(highest >= 128);
            assert(highest <= 255);
        }
        else {
            assert(highest < 128);
        }
        break;
    case PyUnicode_2BYTE_KIND:
        assert(highest >= 0x100);
        assert(highest <= 0xFFFF);
        break;
    default:
        assert(highest >= 0x10000);
        assert(highest <= MAX_UNICODE);
        break;
    }

    /* the character buffer must be null-terminated */
    assert(PyUnicode_READ(kind, chars, ascii->length) == 0);
    return 1;
}
#endif
422
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100423static PyObject*
424unicode_result_wchar(PyObject *unicode)
425{
426#ifndef Py_DEBUG
427 Py_ssize_t len;
428
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100429 len = _PyUnicode_WSTR_LENGTH(unicode);
430 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200432 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 }
434
435 if (len == 1) {
436 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100437 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100438 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
439 Py_DECREF(unicode);
440 return latin1_char;
441 }
442 }
443
444 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200445 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100446 return NULL;
447 }
448#else
Victor Stinneraa771272012-10-04 02:32:58 +0200449 assert(Py_REFCNT(unicode) == 1);
450
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100451 /* don't make the result ready in debug mode to ensure that the caller
452 makes the string ready before using it */
453 assert(_PyUnicode_CheckConsistency(unicode, 1));
454#endif
455 return unicode;
456}
457
458static PyObject*
459unicode_result_ready(PyObject *unicode)
460{
461 Py_ssize_t length;
462
463 length = PyUnicode_GET_LENGTH(unicode);
464 if (length == 0) {
465 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100466 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200467 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 }
469 return unicode_empty;
470 }
471
472 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200473 void *data = PyUnicode_DATA(unicode);
474 int kind = PyUnicode_KIND(unicode);
475 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100476 if (ch < 256) {
477 PyObject *latin1_char = unicode_latin1[ch];
478 if (latin1_char != NULL) {
479 if (unicode != latin1_char) {
480 Py_INCREF(latin1_char);
481 Py_DECREF(unicode);
482 }
483 return latin1_char;
484 }
485 else {
486 assert(_PyUnicode_CheckConsistency(unicode, 1));
487 Py_INCREF(unicode);
488 unicode_latin1[ch] = unicode;
489 return unicode;
490 }
491 }
492 }
493
494 assert(_PyUnicode_CheckConsistency(unicode, 1));
495 return unicode;
496}
497
498static PyObject*
499unicode_result(PyObject *unicode)
500{
501 assert(_PyUnicode_CHECK(unicode));
502 if (PyUnicode_IS_READY(unicode))
503 return unicode_result_ready(unicode);
504 else
505 return unicode_result_wchar(unicode);
506}
507
Victor Stinnerc4b49542011-12-11 22:44:26 +0100508static PyObject*
509unicode_result_unchanged(PyObject *unicode)
510{
511 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500512 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100513 return NULL;
514 Py_INCREF(unicode);
515 return unicode;
516 }
517 else
518 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100519 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100520}
521
#ifdef HAVE_MBCS
/* Windows version information, only compiled in when HAVE_MBCS is defined.
   NOTE(review): it is filled in and used outside the part of the file
   shown here -- confirm where before relying on its contents. */
static OSVERSIONINFOEX winver;
#endif
525
Thomas Wouters477c8d52006-05-27 19:21:47 +0000526/* --- Bloom Filters ----------------------------------------------------- */
527
528/* stuff to implement simple "bloom filters" for Unicode characters.
529 to keep things simple, we use a single bitmask, using the least 5
530 bits from each unicode characters as the bit index. */
531
532/* the linebreak mask is set up by Unicode_Init below */
533
Antoine Pitrouf068f942010-01-13 14:19:12 +0000534#if LONG_BIT >= 128
535#define BLOOM_WIDTH 128
536#elif LONG_BIT >= 64
537#define BLOOM_WIDTH 64
538#elif LONG_BIT >= 32
539#define BLOOM_WIDTH 32
540#else
541#error "LONG_BIT is smaller than 32"
542#endif
543
Thomas Wouters477c8d52006-05-27 19:21:47 +0000544#define BLOOM_MASK unsigned long
545
Serhiy Storchaka05997252013-01-26 12:14:02 +0200546static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
Antoine Pitrouf068f942010-01-13 14:19:12 +0000548#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000549
Benjamin Peterson29060642009-01-31 22:14:21 +0000550#define BLOOM_LINEBREAK(ch) \
551 ((ch) < 128U ? ascii_linebreak[(ch)] : \
552 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000553
Alexander Belopolsky40018472011-02-26 01:02:56 +0000554Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000556{
Victor Stinnera85af502013-04-09 21:53:54 +0200557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
558 do { \
559 TYPE *data = (TYPE *)PTR; \
560 TYPE *end = data + LEN; \
561 Py_UCS4 ch; \
562 for (; data != end; data++) { \
563 ch = *data; \
564 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565 } \
566 break; \
567 } while (0)
568
Thomas Wouters477c8d52006-05-27 19:21:47 +0000569 /* calculate simple bloom-style bitmask for a given unicode string */
570
Antoine Pitrouf068f942010-01-13 14:19:12 +0000571 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000572
573 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200574 switch (kind) {
575 case PyUnicode_1BYTE_KIND:
576 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577 break;
578 case PyUnicode_2BYTE_KIND:
579 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580 break;
581 case PyUnicode_4BYTE_KIND:
582 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583 break;
584 default:
585 assert(0);
586 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000587 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200588
589#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000590}
591
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200592/* Compilation of templated routines */
593
594#include "stringlib/asciilib.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/partition.h"
597#include "stringlib/split.h"
598#include "stringlib/count.h"
599#include "stringlib/find.h"
600#include "stringlib/find_max_char.h"
601#include "stringlib/localeutil.h"
602#include "stringlib/undef.h"
603
604#include "stringlib/ucs1lib.h"
605#include "stringlib/fastsearch.h"
606#include "stringlib/partition.h"
607#include "stringlib/split.h"
608#include "stringlib/count.h"
609#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300610#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
615#include "stringlib/ucs2lib.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/partition.h"
618#include "stringlib/split.h"
619#include "stringlib/count.h"
620#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300621#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300632#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200633#include "stringlib/find_max_char.h"
634#include "stringlib/localeutil.h"
635#include "stringlib/undef.h"
636
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200637#include "stringlib/unicodedefs.h"
638#include "stringlib/fastsearch.h"
639#include "stringlib/count.h"
640#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100641#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200642
Guido van Rossumd57fd912000-03-10 22:53:23 +0000643/* --- Unicode Object ----------------------------------------------------- */
644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200647
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
649 Py_ssize_t size, Py_UCS4 ch,
650 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200651{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200652 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
653
654 switch (kind) {
655 case PyUnicode_1BYTE_KIND:
656 {
657 Py_UCS1 ch1 = (Py_UCS1) ch;
658 if (ch1 == ch)
659 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
660 else
661 return -1;
662 }
663 case PyUnicode_2BYTE_KIND:
664 {
665 Py_UCS2 ch2 = (Py_UCS2) ch;
666 if (ch2 == ch)
667 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
668 else
669 return -1;
670 }
671 case PyUnicode_4BYTE_KIND:
672 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
673 default:
674 assert(0);
675 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200677}
678
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, current length)
   with 0xff bytes so that code reading them before they are initialized is
   detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Does nothing when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (old_length < new_length) {
        /* Offsets and sizes are expressed in bytes, hence the scaling by
           the width of one character. */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
697
Victor Stinnerfe226c02011-10-03 03:52:20 +0200698static PyObject*
699resize_compact(PyObject *unicode, Py_ssize_t length)
700{
701 Py_ssize_t char_size;
702 Py_ssize_t struct_size;
703 Py_ssize_t new_size;
704 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100705 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200706#ifdef Py_DEBUG
707 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708#endif
709
Victor Stinner79891572012-05-03 13:43:07 +0200710 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200711 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100712 assert(PyUnicode_IS_COMPACT(unicode));
713
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200714 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100715 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200716 struct_size = sizeof(PyASCIIObject);
717 else
718 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200719 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200720
Victor Stinnerfe226c02011-10-03 03:52:20 +0200721 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
722 PyErr_NoMemory();
723 return NULL;
724 }
725 new_size = (struct_size + (length + 1) * char_size);
726
Victor Stinner84def372011-12-11 20:04:56 +0100727 _Py_DEC_REFTOTAL;
728 _Py_ForgetReference(unicode);
729
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300730 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100731 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100732 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200733 PyErr_NoMemory();
734 return NULL;
735 }
Victor Stinner84def372011-12-11 20:04:56 +0100736 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100738
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200740 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200741 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100742 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 _PyUnicode_WSTR_LENGTH(unicode) = length;
744 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100745 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746 PyObject_DEL(_PyUnicode_WSTR(unicode));
747 _PyUnicode_WSTR(unicode) = NULL;
748 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200749#ifdef Py_DEBUG
750 unicode_fill_invalid(unicode, old_length);
751#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200752 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
753 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200754 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200755 return unicode;
756}
757
Alexander Belopolsky40018472011-02-26 01:02:56 +0000758static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200759resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760{
Victor Stinner95663112011-10-04 01:03:50 +0200761 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100762 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200763 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000765
Victor Stinnerfe226c02011-10-03 03:52:20 +0200766 if (PyUnicode_IS_READY(unicode)) {
767 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200768 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200770#ifdef Py_DEBUG
771 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
772#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200773
774 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200775 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200776 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
777 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200778
779 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
780 PyErr_NoMemory();
781 return -1;
782 }
783 new_size = (length + 1) * char_size;
784
Victor Stinner7a9105a2011-12-12 00:13:42 +0100785 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
786 {
787 PyObject_DEL(_PyUnicode_UTF8(unicode));
788 _PyUnicode_UTF8(unicode) = NULL;
789 _PyUnicode_UTF8_LENGTH(unicode) = 0;
790 }
791
Victor Stinnerfe226c02011-10-03 03:52:20 +0200792 data = (PyObject *)PyObject_REALLOC(data, new_size);
793 if (data == NULL) {
794 PyErr_NoMemory();
795 return -1;
796 }
797 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200798 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200799 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200800 _PyUnicode_WSTR_LENGTH(unicode) = length;
801 }
802 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200803 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200804 _PyUnicode_UTF8_LENGTH(unicode) = length;
805 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200806 _PyUnicode_LENGTH(unicode) = length;
807 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200808#ifdef Py_DEBUG
809 unicode_fill_invalid(unicode, old_length);
810#endif
Victor Stinner95663112011-10-04 01:03:50 +0200811 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200812 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200814 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200815 }
Victor Stinner95663112011-10-04 01:03:50 +0200816 assert(_PyUnicode_WSTR(unicode) != NULL);
817
818 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700819 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +0200820 PyErr_NoMemory();
821 return -1;
822 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100823 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200824 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100825 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200826 if (!wstr) {
827 PyErr_NoMemory();
828 return -1;
829 }
830 _PyUnicode_WSTR(unicode) = wstr;
831 _PyUnicode_WSTR(unicode)[length] = 0;
832 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200833 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000834 return 0;
835}
836
Victor Stinnerfe226c02011-10-03 03:52:20 +0200837static PyObject*
838resize_copy(PyObject *unicode, Py_ssize_t length)
839{
840 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100841 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200842 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843
Benjamin Petersonbac79492012-01-14 13:34:47 -0500844 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100845 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200846
847 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
848 if (copy == NULL)
849 return NULL;
850
851 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200852 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200853 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200854 }
855 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200856 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100857
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200858 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200859 if (w == NULL)
860 return NULL;
861 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
862 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200863 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
864 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200865 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200866 }
867}
868
/* We allocate room for one extra character (not merely one extra byte) so
   that the string is always terminated by U+0000; some code (e.g.
   new_identifier) relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
877
Alexander Belopolsky40018472011-02-26 01:02:56 +0000878static PyUnicodeObject *
879_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200881 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200882 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883
Thomas Wouters477c8d52006-05-27 19:21:47 +0000884 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885 if (length == 0 && unicode_empty != NULL) {
886 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200887 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888 }
889
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000890 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700891 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000892 return (PyUnicodeObject *)PyErr_NoMemory();
893 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200894 if (length < 0) {
895 PyErr_SetString(PyExc_SystemError,
896 "Negative size passed to _PyUnicode_New");
897 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000898 }
899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200900 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
901 if (unicode == NULL)
902 return NULL;
903 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100904
905 _PyUnicode_WSTR_LENGTH(unicode) = length;
906 _PyUnicode_HASH(unicode) = -1;
907 _PyUnicode_STATE(unicode).interned = 0;
908 _PyUnicode_STATE(unicode).kind = 0;
909 _PyUnicode_STATE(unicode).compact = 0;
910 _PyUnicode_STATE(unicode).ready = 0;
911 _PyUnicode_STATE(unicode).ascii = 0;
912 _PyUnicode_DATA_ANY(unicode) = NULL;
913 _PyUnicode_LENGTH(unicode) = 0;
914 _PyUnicode_UTF8(unicode) = NULL;
915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200917 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000920 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100921 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200923
Jeremy Hyltond8082792003-09-16 19:41:39 +0000924 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000925 * the caller fails before initializing str -- unicode_resize()
926 * reads str[0], and the Keep-Alive optimization can keep memory
927 * allocated for str alive across a call to unicode_dealloc(unicode).
928 * We don't want unicode_resize to read uninitialized memory in
929 * that case.
930 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200931 _PyUnicode_WSTR(unicode)[0] = 0;
932 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100933
Victor Stinner7931d9a2011-11-04 00:22:48 +0100934 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935 return unicode;
936}
937
Victor Stinnerf42dc442011-10-02 23:33:16 +0200938static const char*
939unicode_kind_name(PyObject *unicode)
940{
Victor Stinner42dfd712011-10-03 14:41:45 +0200941 /* don't check consistency: unicode_kind_name() is called from
942 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200943 if (!PyUnicode_IS_COMPACT(unicode))
944 {
945 if (!PyUnicode_IS_READY(unicode))
946 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600947 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200948 {
949 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200950 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200951 return "legacy ascii";
952 else
953 return "legacy latin1";
954 case PyUnicode_2BYTE_KIND:
955 return "legacy UCS2";
956 case PyUnicode_4BYTE_KIND:
957 return "legacy UCS4";
958 default:
959 return "<legacy invalid kind>";
960 }
961 }
962 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600963 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200965 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200966 return "ascii";
967 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200972 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200973 default:
974 return "<invalid compact kind>";
975 }
976}
977
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200978#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200979/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
983
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
987void *_PyUnicode_data(void *unicode){
988 printf("obj %p\n", unicode);
989 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
990 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
991 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
992 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
993 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
994 return PyUnicode_DATA(unicode);
995}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200996
997void
998_PyUnicode_Dump(PyObject *op)
999{
1000 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001001 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1002 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1003 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001004
Victor Stinnera849a4b2011-10-03 12:12:11 +02001005 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001006 {
1007 if (ascii->state.ascii)
1008 data = (ascii + 1);
1009 else
1010 data = (compact + 1);
1011 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001012 else
1013 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001014 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1015 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001016
Victor Stinnera849a4b2011-10-03 12:12:11 +02001017 if (ascii->wstr == data)
1018 printf("shared ");
1019 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001020
Victor Stinnera3b334d2011-10-03 13:53:37 +02001021 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001022 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001023 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1024 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001025 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1026 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001027 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001028 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001029}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001030#endif
1031
1032PyObject *
1033PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1034{
1035 PyObject *obj;
1036 PyCompactUnicodeObject *unicode;
1037 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001038 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001039 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001040 Py_ssize_t char_size;
1041 Py_ssize_t struct_size;
1042
1043 /* Optimization for empty strings */
1044 if (size == 0 && unicode_empty != NULL) {
1045 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001046 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001047 }
1048
Victor Stinner9e9d6892011-10-04 01:02:02 +02001049 is_ascii = 0;
1050 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001051 struct_size = sizeof(PyCompactUnicodeObject);
1052 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001053 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 char_size = 1;
1055 is_ascii = 1;
1056 struct_size = sizeof(PyASCIIObject);
1057 }
1058 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001059 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001060 char_size = 1;
1061 }
1062 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001063 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001064 char_size = 2;
1065 if (sizeof(wchar_t) == 2)
1066 is_sharing = 1;
1067 }
1068 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001069 if (maxchar > MAX_UNICODE) {
1070 PyErr_SetString(PyExc_SystemError,
1071 "invalid maximum character passed to PyUnicode_New");
1072 return NULL;
1073 }
Victor Stinner8f825062012-04-27 13:55:39 +02001074 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001075 char_size = 4;
1076 if (sizeof(wchar_t) == 4)
1077 is_sharing = 1;
1078 }
1079
1080 /* Ensure we won't overflow the size. */
1081 if (size < 0) {
1082 PyErr_SetString(PyExc_SystemError,
1083 "Negative size passed to PyUnicode_New");
1084 return NULL;
1085 }
1086 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1087 return PyErr_NoMemory();
1088
1089 /* Duplicated allocation code from _PyObject_New() instead of a call to
1090 * PyObject_New() so we are able to allocate space for the object and
1091 * it's data buffer.
1092 */
1093 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1094 if (obj == NULL)
1095 return PyErr_NoMemory();
1096 obj = PyObject_INIT(obj, &PyUnicode_Type);
1097 if (obj == NULL)
1098 return NULL;
1099
1100 unicode = (PyCompactUnicodeObject *)obj;
1101 if (is_ascii)
1102 data = ((PyASCIIObject*)obj) + 1;
1103 else
1104 data = unicode + 1;
1105 _PyUnicode_LENGTH(unicode) = size;
1106 _PyUnicode_HASH(unicode) = -1;
1107 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001108 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001109 _PyUnicode_STATE(unicode).compact = 1;
1110 _PyUnicode_STATE(unicode).ready = 1;
1111 _PyUnicode_STATE(unicode).ascii = is_ascii;
1112 if (is_ascii) {
1113 ((char*)data)[size] = 0;
1114 _PyUnicode_WSTR(unicode) = NULL;
1115 }
Victor Stinner8f825062012-04-27 13:55:39 +02001116 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001117 ((char*)data)[size] = 0;
1118 _PyUnicode_WSTR(unicode) = NULL;
1119 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001121 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 else {
1124 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001125 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001126 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001128 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001129 ((Py_UCS4*)data)[size] = 0;
1130 if (is_sharing) {
1131 _PyUnicode_WSTR_LENGTH(unicode) = size;
1132 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1133 }
1134 else {
1135 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1136 _PyUnicode_WSTR(unicode) = NULL;
1137 }
1138 }
Victor Stinner8f825062012-04-27 13:55:39 +02001139#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001140 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001141#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001142 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001143 return obj;
1144}
1145
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t sequence [begin, end) to UCS4 and store the
   result in the 4-byte data buffer of `unicode`. A high surrogate directly
   followed by a low surrogate is joined into one code point; every other
   unit, including a lone surrogate, is copied as is. The other conversions
   are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    src = begin;
    while (src < end) {
        /* Never write past the length announced by the target string. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && end - src > 1
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            dst++;
            src += 2;
        }
        else {
            *dst = *src;
            dst++;
            src++;
        }
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
1185
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186static int
Victor Stinner488fa492011-12-12 00:01:39 +01001187unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001188{
Victor Stinner488fa492011-12-12 00:01:39 +01001189 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001190 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001191 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001192 return -1;
1193 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001194 return 0;
1195}
1196
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001197static int
1198_copy_characters(PyObject *to, Py_ssize_t to_start,
1199 PyObject *from, Py_ssize_t from_start,
1200 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001201{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001202 unsigned int from_kind, to_kind;
1203 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204
Victor Stinneree4544c2012-05-09 22:24:08 +02001205 assert(0 <= how_many);
1206 assert(0 <= from_start);
1207 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001208 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001209 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001210 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211
Victor Stinnerd3f08822012-05-29 12:57:52 +02001212 assert(PyUnicode_Check(to));
1213 assert(PyUnicode_IS_READY(to));
1214 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1215
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001216 if (how_many == 0)
1217 return 0;
1218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001222 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001223
Victor Stinnerf1852262012-06-16 16:38:26 +02001224#ifdef Py_DEBUG
1225 if (!check_maxchar
1226 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1227 {
1228 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1229 Py_UCS4 ch;
1230 Py_ssize_t i;
1231 for (i=0; i < how_many; i++) {
1232 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1233 assert(ch <= to_maxchar);
1234 }
1235 }
1236#endif
1237
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001238 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001239 if (check_maxchar
1240 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1241 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001242 /* Writing Latin-1 characters into an ASCII string requires to
1243 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001244 Py_UCS4 max_char;
1245 max_char = ucs1lib_find_max_char(from_data,
1246 (Py_UCS1*)from_data + how_many);
1247 if (max_char >= 128)
1248 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001249 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001250 Py_MEMCPY((char*)to_data + to_kind * to_start,
1251 (char*)from_data + from_kind * from_start,
1252 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001253 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001254 else if (from_kind == PyUnicode_1BYTE_KIND
1255 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001256 {
1257 _PyUnicode_CONVERT_BYTES(
1258 Py_UCS1, Py_UCS2,
1259 PyUnicode_1BYTE_DATA(from) + from_start,
1260 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1261 PyUnicode_2BYTE_DATA(to) + to_start
1262 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001264 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001265 && to_kind == PyUnicode_4BYTE_KIND)
1266 {
1267 _PyUnicode_CONVERT_BYTES(
1268 Py_UCS1, Py_UCS4,
1269 PyUnicode_1BYTE_DATA(from) + from_start,
1270 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1271 PyUnicode_4BYTE_DATA(to) + to_start
1272 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001273 }
1274 else if (from_kind == PyUnicode_2BYTE_KIND
1275 && to_kind == PyUnicode_4BYTE_KIND)
1276 {
1277 _PyUnicode_CONVERT_BYTES(
1278 Py_UCS2, Py_UCS4,
1279 PyUnicode_2BYTE_DATA(from) + from_start,
1280 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1281 PyUnicode_4BYTE_DATA(to) + to_start
1282 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001283 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001284 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1286
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001287 if (!check_maxchar) {
1288 if (from_kind == PyUnicode_2BYTE_KIND
1289 && to_kind == PyUnicode_1BYTE_KIND)
1290 {
1291 _PyUnicode_CONVERT_BYTES(
1292 Py_UCS2, Py_UCS1,
1293 PyUnicode_2BYTE_DATA(from) + from_start,
1294 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1295 PyUnicode_1BYTE_DATA(to) + to_start
1296 );
1297 }
1298 else if (from_kind == PyUnicode_4BYTE_KIND
1299 && to_kind == PyUnicode_1BYTE_KIND)
1300 {
1301 _PyUnicode_CONVERT_BYTES(
1302 Py_UCS4, Py_UCS1,
1303 PyUnicode_4BYTE_DATA(from) + from_start,
1304 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1305 PyUnicode_1BYTE_DATA(to) + to_start
1306 );
1307 }
1308 else if (from_kind == PyUnicode_4BYTE_KIND
1309 && to_kind == PyUnicode_2BYTE_KIND)
1310 {
1311 _PyUnicode_CONVERT_BYTES(
1312 Py_UCS4, Py_UCS2,
1313 PyUnicode_4BYTE_DATA(from) + from_start,
1314 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1315 PyUnicode_2BYTE_DATA(to) + to_start
1316 );
1317 }
1318 else {
1319 assert(0);
1320 return -1;
1321 }
1322 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001323 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001325 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 Py_ssize_t i;
1327
Victor Stinnera0702ab2011-09-29 14:14:38 +02001328 for (i=0; i < how_many; i++) {
1329 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001330 if (ch > to_maxchar)
1331 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001332 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1333 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001334 }
1335 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001336 return 0;
1337}
1338
Victor Stinnerd3f08822012-05-29 12:57:52 +02001339void
1340_PyUnicode_FastCopyCharacters(
1341 PyObject *to, Py_ssize_t to_start,
1342 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001343{
1344 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1345}
1346
1347Py_ssize_t
1348PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1349 PyObject *from, Py_ssize_t from_start,
1350 Py_ssize_t how_many)
1351{
1352 int err;
1353
1354 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1355 PyErr_BadInternalCall();
1356 return -1;
1357 }
1358
Benjamin Petersonbac79492012-01-14 13:34:47 -05001359 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001360 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001361 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001362 return -1;
1363
Victor Stinnerd3f08822012-05-29 12:57:52 +02001364 if (from_start < 0) {
1365 PyErr_SetString(PyExc_IndexError, "string index out of range");
1366 return -1;
1367 }
1368 if (to_start < 0) {
1369 PyErr_SetString(PyExc_IndexError, "string index out of range");
1370 return -1;
1371 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001372 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1373 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1374 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001375 "Cannot write %zi characters at %zi "
1376 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001377 how_many, to_start, PyUnicode_GET_LENGTH(to));
1378 return -1;
1379 }
1380
1381 if (how_many == 0)
1382 return 0;
1383
Victor Stinner488fa492011-12-12 00:01:39 +01001384 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001385 return -1;
1386
1387 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1388 if (err) {
1389 PyErr_Format(PyExc_SystemError,
1390 "Cannot copy %s characters "
1391 "into a string of %s characters",
1392 unicode_kind_name(from),
1393 unicode_kind_name(to));
1394 return -1;
1395 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001396 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397}
1398
Victor Stinner17222162011-09-28 22:15:37 +02001399/* Find the maximum code point and count the number of surrogate pairs so a
1400 correct string length can be computed before converting a string to UCS4.
1401 This function counts single surrogates as a character and not as a pair.
1402
1403 Return 0 on success, or -1 on error. */
1404static int
1405find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1406 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001407{
1408 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001409 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410
Victor Stinnerc53be962011-10-02 21:33:54 +02001411 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001412 *num_surrogates = 0;
1413 *maxchar = 0;
1414
1415 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001416#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001417 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1418 && (iter+1) < end
1419 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1420 {
1421 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1422 ++(*num_surrogates);
1423 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 }
1425 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001427 {
1428 ch = *iter;
1429 iter++;
1430 }
1431 if (ch > *maxchar) {
1432 *maxchar = ch;
1433 if (*maxchar > MAX_UNICODE) {
1434 PyErr_Format(PyExc_ValueError,
1435 "character U+%x is not in range [U+0000; U+10ffff]",
1436 ch);
1437 return -1;
1438 }
1439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 }
1441 return 0;
1442}
1443
/* Convert a string that only has a wchar_t (wstr) representation into its
   canonical form: pick the narrowest kind (1, 2 or 4 bytes per character)
   that can hold the largest code point found in the wstr buffer, fill the
   data buffer accordingly and mark the object as ready.

   Return 0 on success.  Return -1 with an exception set if the wstr buffer
   holds a code point rejected by find_maxchar_surrogates() or if a memory
   allocation fails. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* One scan of the wstr buffer yields both the widest character and the
       number of surrogate pairs. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    /* Every character fits in one byte: narrow into a freshly allocated
       1-byte buffer and release the wstr buffer. */
    if (maxchar < 256) {
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer is also used as the UTF-8
               representation. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                 _PyUnicode_WSTR(unicode), end,
                                 PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into one character. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: the wstr buffer is reused as the
           data buffer, so it is not freed here. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        /* Terminate the 4-byte buffer in both configurations. */
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1568
Alexander Belopolsky40018472011-02-26 01:02:56 +00001569static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001570unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571{
Walter Dörwald16807132007-05-25 13:52:07 +00001572 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001573 case SSTATE_NOT_INTERNED:
1574 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001575
Benjamin Peterson29060642009-01-31 22:14:21 +00001576 case SSTATE_INTERNED_MORTAL:
1577 /* revive dead object temporarily for DelItem */
1578 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001579 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001580 Py_FatalError(
1581 "deletion of interned string failed");
1582 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001583
Benjamin Peterson29060642009-01-31 22:14:21 +00001584 case SSTATE_INTERNED_IMMORTAL:
1585 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001586
Benjamin Peterson29060642009-01-31 22:14:21 +00001587 default:
1588 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001589 }
1590
Victor Stinner03490912011-10-03 23:45:12 +02001591 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001592 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001593 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001594 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001595 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1596 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001597
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001598 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599}
1600
#ifdef Py_DEBUG
/* Debug-only helper.  Return 1 if `unicode` is the shared empty string or
   the entry stored in unicode_latin1[] for its single character; return 0
   otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    /* Only a one-character string that already has a concrete kind can be
       one of the cached Latin-1 characters. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch >= 256)
        return 0;
    return (unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1617
Alexander Belopolsky40018472011-02-26 01:02:56 +00001618static int
Victor Stinner488fa492011-12-12 00:01:39 +01001619unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001620{
Victor Stinner488fa492011-12-12 00:01:39 +01001621 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001622 if (Py_REFCNT(unicode) != 1)
1623 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001624 if (_PyUnicode_HASH(unicode) != -1)
1625 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001626 if (PyUnicode_CHECK_INTERNED(unicode))
1627 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001628 if (!PyUnicode_CheckExact(unicode))
1629 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001630#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001631 /* singleton refcount is greater than 1 */
1632 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001633#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001634 return 1;
1635}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001636
Victor Stinnerfe226c02011-10-03 03:52:20 +02001637static int
1638unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1639{
1640 PyObject *unicode;
1641 Py_ssize_t old_length;
1642
1643 assert(p_unicode != NULL);
1644 unicode = *p_unicode;
1645
1646 assert(unicode != NULL);
1647 assert(PyUnicode_Check(unicode));
1648 assert(0 <= length);
1649
Victor Stinner910337b2011-10-03 03:20:16 +02001650 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001651 old_length = PyUnicode_WSTR_LENGTH(unicode);
1652 else
1653 old_length = PyUnicode_GET_LENGTH(unicode);
1654 if (old_length == length)
1655 return 0;
1656
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001657 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001658 _Py_INCREF_UNICODE_EMPTY();
1659 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001660 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001661 Py_DECREF(*p_unicode);
1662 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001663 return 0;
1664 }
1665
Victor Stinner488fa492011-12-12 00:01:39 +01001666 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001667 PyObject *copy = resize_copy(unicode, length);
1668 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001669 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001670 Py_DECREF(*p_unicode);
1671 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001672 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001673 }
1674
Victor Stinnerfe226c02011-10-03 03:52:20 +02001675 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001676 PyObject *new_unicode = resize_compact(unicode, length);
1677 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001678 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001679 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001680 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001681 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001682 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001683}
1684
Alexander Belopolsky40018472011-02-26 01:02:56 +00001685int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001686PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001687{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001688 PyObject *unicode;
1689 if (p_unicode == NULL) {
1690 PyErr_BadInternalCall();
1691 return -1;
1692 }
1693 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001694 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001695 {
1696 PyErr_BadInternalCall();
1697 return -1;
1698 }
1699 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001700}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001701
Victor Stinnerc5166102012-02-22 13:55:02 +01001702/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001703
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001704 WARNING: The function doesn't copy the terminating null character and
1705 doesn't check the maximum character (may write a latin1 character in an
1706 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001707static void
1708unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1709 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001710{
1711 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1712 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001713 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001714
1715 switch (kind) {
1716 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001717 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001718#ifdef Py_DEBUG
1719 if (PyUnicode_IS_ASCII(unicode)) {
1720 Py_UCS4 maxchar = ucs1lib_find_max_char(
1721 (const Py_UCS1*)str,
1722 (const Py_UCS1*)str + len);
1723 assert(maxchar < 128);
1724 }
1725#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001726 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001727 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001728 }
1729 case PyUnicode_2BYTE_KIND: {
1730 Py_UCS2 *start = (Py_UCS2 *)data + index;
1731 Py_UCS2 *ucs2 = start;
1732 assert(index <= PyUnicode_GET_LENGTH(unicode));
1733
Victor Stinner184252a2012-06-16 02:57:41 +02001734 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001735 *ucs2 = (Py_UCS2)*str;
1736
1737 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001738 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001739 }
1740 default: {
1741 Py_UCS4 *start = (Py_UCS4 *)data + index;
1742 Py_UCS4 *ucs4 = start;
1743 assert(kind == PyUnicode_4BYTE_KIND);
1744 assert(index <= PyUnicode_GET_LENGTH(unicode));
1745
Victor Stinner184252a2012-06-16 02:57:41 +02001746 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001747 *ucs4 = (Py_UCS4)*str;
1748
1749 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001750 }
1751 }
1752}
1753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754static PyObject*
1755get_latin1_char(unsigned char ch)
1756{
Victor Stinnera464fc12011-10-02 20:39:30 +02001757 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001759 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 if (!unicode)
1761 return NULL;
1762 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001763 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 unicode_latin1[ch] = unicode;
1765 }
1766 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001767 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768}
1769
Victor Stinner985a82a2014-01-03 12:53:47 +01001770static PyObject*
1771unicode_char(Py_UCS4 ch)
1772{
1773 PyObject *unicode;
1774
1775 assert(ch <= MAX_UNICODE);
1776
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001777 if (ch < 256)
1778 return get_latin1_char(ch);
1779
Victor Stinner985a82a2014-01-03 12:53:47 +01001780 unicode = PyUnicode_New(1, ch);
1781 if (unicode == NULL)
1782 return NULL;
1783 switch (PyUnicode_KIND(unicode)) {
1784 case PyUnicode_1BYTE_KIND:
1785 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1786 break;
1787 case PyUnicode_2BYTE_KIND:
1788 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1789 break;
1790 default:
1791 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1792 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1793 }
1794 assert(_PyUnicode_CheckConsistency(unicode, 1));
1795 return unicode;
1796}
1797
Alexander Belopolsky40018472011-02-26 01:02:56 +00001798PyObject *
1799PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001801 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 Py_UCS4 maxchar = 0;
1803 Py_ssize_t num_surrogates;
1804
1805 if (u == NULL)
1806 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001808 /* If the Unicode data is known at construction time, we can apply
1809 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001812 if (size == 0)
1813 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001814
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 /* Single character Unicode objects in the Latin-1 range are
1816 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001817 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 return get_latin1_char((unsigned char)*u);
1819
1820 /* If not empty and not single character, copy the Unicode data
1821 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001822 if (find_maxchar_surrogates(u, u + size,
1823 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001824 return NULL;
1825
Victor Stinner8faf8212011-12-08 22:14:11 +01001826 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827 if (!unicode)
1828 return NULL;
1829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830 switch (PyUnicode_KIND(unicode)) {
1831 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001832 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1834 break;
1835 case PyUnicode_2BYTE_KIND:
1836#if Py_UNICODE_SIZE == 2
1837 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1838#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001839 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1841#endif
1842 break;
1843 case PyUnicode_4BYTE_KIND:
1844#if SIZEOF_WCHAR_T == 2
1845 /* This is the only case which has to process surrogates, thus
1846 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001847 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001848#else
1849 assert(num_surrogates == 0);
1850 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1851#endif
1852 break;
1853 default:
1854 assert(0 && "Impossible state");
1855 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001857 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858}
1859
Alexander Belopolsky40018472011-02-26 01:02:56 +00001860PyObject *
1861PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001862{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001863 if (size < 0) {
1864 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001865 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001866 return NULL;
1867 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001868 if (u != NULL)
1869 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1870 else
1871 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001872}
1873
Alexander Belopolsky40018472011-02-26 01:02:56 +00001874PyObject *
1875PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001876{
1877 size_t size = strlen(u);
1878 if (size > PY_SSIZE_T_MAX) {
1879 PyErr_SetString(PyExc_OverflowError, "input too long");
1880 return NULL;
1881 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001882 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001883}
1884
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001885PyObject *
1886_PyUnicode_FromId(_Py_Identifier *id)
1887{
1888 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001889 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1890 strlen(id->string),
1891 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001892 if (!id->object)
1893 return NULL;
1894 PyUnicode_InternInPlace(&id->object);
1895 assert(!id->next);
1896 id->next = static_strings;
1897 static_strings = id;
1898 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001899 return id->object;
1900}
1901
1902void
1903_PyUnicode_ClearStaticStrings()
1904{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001905 _Py_Identifier *tmp, *s = static_strings;
1906 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001907 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001908 tmp = s->next;
1909 s->next = NULL;
1910 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001911 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001912 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001913}
1914
Benjamin Peterson0df54292012-03-26 14:50:32 -04001915/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001916
Victor Stinnerd3f08822012-05-29 12:57:52 +02001917PyObject*
1918_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001919{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001920 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001921 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001922 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001923#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001924 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001925#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001926 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001927 }
Victor Stinner785938e2011-12-11 20:09:03 +01001928 unicode = PyUnicode_New(size, 127);
1929 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001930 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001931 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1932 assert(_PyUnicode_CheckConsistency(unicode, 1));
1933 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001934}
1935
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001936static Py_UCS4
1937kind_maxchar_limit(unsigned int kind)
1938{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001939 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001940 case PyUnicode_1BYTE_KIND:
1941 return 0x80;
1942 case PyUnicode_2BYTE_KIND:
1943 return 0x100;
1944 case PyUnicode_4BYTE_KIND:
1945 return 0x10000;
1946 default:
1947 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001948 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001949 }
1950}
1951
Victor Stinnere6abb482012-05-02 01:15:40 +02001952Py_LOCAL_INLINE(Py_UCS4)
1953align_maxchar(Py_UCS4 maxchar)
1954{
1955 if (maxchar <= 127)
1956 return 127;
1957 else if (maxchar <= 255)
1958 return 255;
1959 else if (maxchar <= 65535)
1960 return 65535;
1961 else
1962 return MAX_UNICODE;
1963}
1964
Victor Stinner702c7342011-10-05 13:50:52 +02001965static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001966_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001969 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001970
Serhiy Storchaka678db842013-01-26 12:16:36 +02001971 if (size == 0)
1972 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001973 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001974 if (size == 1)
1975 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001976
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001977 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001978 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001979 if (!res)
1980 return NULL;
1981 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001982 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001983 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001984}
1985
Victor Stinnere57b1c02011-09-28 22:20:48 +02001986static PyObject*
1987_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988{
1989 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001990 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001991
Serhiy Storchaka678db842013-01-26 12:16:36 +02001992 if (size == 0)
1993 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001994 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01001995 if (size == 1)
1996 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001997
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001998 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001999 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002000 if (!res)
2001 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002002 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002003 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002004 else {
2005 _PyUnicode_CONVERT_BYTES(
2006 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2007 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002008 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002009 return res;
2010}
2011
Victor Stinnere57b1c02011-09-28 22:20:48 +02002012static PyObject*
2013_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014{
2015 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002016 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002017
Serhiy Storchaka678db842013-01-26 12:16:36 +02002018 if (size == 0)
2019 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002020 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002021 if (size == 1)
2022 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002023
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002024 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002025 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026 if (!res)
2027 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002028 if (max_char < 256)
2029 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2030 PyUnicode_1BYTE_DATA(res));
2031 else if (max_char < 0x10000)
2032 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2033 PyUnicode_2BYTE_DATA(res));
2034 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002035 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002036 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 return res;
2038}
2039
2040PyObject*
2041PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2042{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002043 if (size < 0) {
2044 PyErr_SetString(PyExc_ValueError, "size must be positive");
2045 return NULL;
2046 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002047 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002048 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002049 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002051 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002053 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002054 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002055 PyErr_SetString(PyExc_SystemError, "invalid kind");
2056 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002058}
2059
Victor Stinnerece58de2012-04-23 23:36:38 +02002060Py_UCS4
2061_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2062{
2063 enum PyUnicode_Kind kind;
2064 void *startptr, *endptr;
2065
2066 assert(PyUnicode_IS_READY(unicode));
2067 assert(0 <= start);
2068 assert(end <= PyUnicode_GET_LENGTH(unicode));
2069 assert(start <= end);
2070
2071 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2072 return PyUnicode_MAX_CHAR_VALUE(unicode);
2073
2074 if (start == end)
2075 return 127;
2076
Victor Stinner94d558b2012-04-27 22:26:58 +02002077 if (PyUnicode_IS_ASCII(unicode))
2078 return 127;
2079
Victor Stinnerece58de2012-04-23 23:36:38 +02002080 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002081 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002082 endptr = (char *)startptr + end * kind;
2083 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002084 switch(kind) {
2085 case PyUnicode_1BYTE_KIND:
2086 return ucs1lib_find_max_char(startptr, endptr);
2087 case PyUnicode_2BYTE_KIND:
2088 return ucs2lib_find_max_char(startptr, endptr);
2089 case PyUnicode_4BYTE_KIND:
2090 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002091 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002092 assert(0);
2093 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002094 }
2095}
2096
Victor Stinner25a4b292011-10-06 12:31:55 +02002097/* Ensure that a string uses the most efficient storage, if it is not the
2098 case: create a new string with of the right kind. Write NULL into *p_unicode
2099 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002100static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002101unicode_adjust_maxchar(PyObject **p_unicode)
2102{
2103 PyObject *unicode, *copy;
2104 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002105 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002106 unsigned int kind;
2107
2108 assert(p_unicode != NULL);
2109 unicode = *p_unicode;
2110 assert(PyUnicode_IS_READY(unicode));
2111 if (PyUnicode_IS_ASCII(unicode))
2112 return;
2113
2114 len = PyUnicode_GET_LENGTH(unicode);
2115 kind = PyUnicode_KIND(unicode);
2116 if (kind == PyUnicode_1BYTE_KIND) {
2117 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002118 max_char = ucs1lib_find_max_char(u, u + len);
2119 if (max_char >= 128)
2120 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002121 }
2122 else if (kind == PyUnicode_2BYTE_KIND) {
2123 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002124 max_char = ucs2lib_find_max_char(u, u + len);
2125 if (max_char >= 256)
2126 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002127 }
2128 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002129 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002130 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002131 max_char = ucs4lib_find_max_char(u, u + len);
2132 if (max_char >= 0x10000)
2133 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002134 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002135 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002136 if (copy != NULL)
2137 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002138 Py_DECREF(unicode);
2139 *p_unicode = copy;
2140}
2141
Victor Stinner034f6cf2011-09-30 02:26:44 +02002142PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002143_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002144{
Victor Stinner87af4f22011-11-21 23:03:47 +01002145 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002146 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002147
Victor Stinner034f6cf2011-09-30 02:26:44 +02002148 if (!PyUnicode_Check(unicode)) {
2149 PyErr_BadInternalCall();
2150 return NULL;
2151 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002152 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002153 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002154
Victor Stinner87af4f22011-11-21 23:03:47 +01002155 length = PyUnicode_GET_LENGTH(unicode);
2156 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002157 if (!copy)
2158 return NULL;
2159 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2160
Victor Stinner87af4f22011-11-21 23:03:47 +01002161 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2162 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002163 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002164 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002165}
2166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002167
Victor Stinnerbc603d12011-10-02 01:00:40 +02002168/* Widen Unicode objects to larger buffers. Don't write terminating null
2169 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170
2171void*
2172_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2173{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002174 Py_ssize_t len;
2175 void *result;
2176 unsigned int skind;
2177
Benjamin Petersonbac79492012-01-14 13:34:47 -05002178 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002179 return NULL;
2180
2181 len = PyUnicode_GET_LENGTH(s);
2182 skind = PyUnicode_KIND(s);
2183 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002184 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 return NULL;
2186 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002187 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002188 case PyUnicode_2BYTE_KIND:
2189 result = PyMem_Malloc(len * sizeof(Py_UCS2));
2190 if (!result)
2191 return PyErr_NoMemory();
2192 assert(skind == PyUnicode_1BYTE_KIND);
2193 _PyUnicode_CONVERT_BYTES(
2194 Py_UCS1, Py_UCS2,
2195 PyUnicode_1BYTE_DATA(s),
2196 PyUnicode_1BYTE_DATA(s) + len,
2197 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002198 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002199 case PyUnicode_4BYTE_KIND:
2200 result = PyMem_Malloc(len * sizeof(Py_UCS4));
2201 if (!result)
2202 return PyErr_NoMemory();
2203 if (skind == PyUnicode_2BYTE_KIND) {
2204 _PyUnicode_CONVERT_BYTES(
2205 Py_UCS2, Py_UCS4,
2206 PyUnicode_2BYTE_DATA(s),
2207 PyUnicode_2BYTE_DATA(s) + len,
2208 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002209 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002210 else {
2211 assert(skind == PyUnicode_1BYTE_KIND);
2212 _PyUnicode_CONVERT_BYTES(
2213 Py_UCS1, Py_UCS4,
2214 PyUnicode_1BYTE_DATA(s),
2215 PyUnicode_1BYTE_DATA(s) + len,
2216 result);
2217 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002218 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002219 default:
2220 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002221 }
Victor Stinner01698042011-10-04 00:04:26 +02002222 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 return NULL;
2224}
2225
2226static Py_UCS4*
2227as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2228 int copy_null)
2229{
2230 int kind;
2231 void *data;
2232 Py_ssize_t len, targetlen;
2233 if (PyUnicode_READY(string) == -1)
2234 return NULL;
2235 kind = PyUnicode_KIND(string);
2236 data = PyUnicode_DATA(string);
2237 len = PyUnicode_GET_LENGTH(string);
2238 targetlen = len;
2239 if (copy_null)
2240 targetlen++;
2241 if (!target) {
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07002242 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UCS4) < targetlen) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243 PyErr_NoMemory();
2244 return NULL;
2245 }
2246 target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2247 if (!target) {
2248 PyErr_NoMemory();
2249 return NULL;
2250 }
2251 }
2252 else {
2253 if (targetsize < targetlen) {
2254 PyErr_Format(PyExc_SystemError,
2255 "string is longer than the buffer");
2256 if (copy_null && 0 < targetsize)
2257 target[0] = 0;
2258 return NULL;
2259 }
2260 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002261 if (kind == PyUnicode_1BYTE_KIND) {
2262 Py_UCS1 *start = (Py_UCS1 *) data;
2263 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002265 else if (kind == PyUnicode_2BYTE_KIND) {
2266 Py_UCS2 *start = (Py_UCS2 *) data;
2267 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2268 }
2269 else {
2270 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002272 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002273 if (copy_null)
2274 target[len] = 0;
2275 return target;
2276}
2277
2278Py_UCS4*
2279PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2280 int copy_null)
2281{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002282 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002283 PyErr_BadInternalCall();
2284 return NULL;
2285 }
2286 return as_ucs4(string, target, targetsize, copy_null);
2287}
2288
2289Py_UCS4*
2290PyUnicode_AsUCS4Copy(PyObject *string)
2291{
2292 return as_ucs4(string, NULL, 0, 1);
2293}
2294
2295#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002296
Alexander Belopolsky40018472011-02-26 01:02:56 +00002297PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002298PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002300 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002302 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002303 PyErr_BadInternalCall();
2304 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002305 }
2306
Martin v. Löwis790465f2008-04-05 20:41:37 +00002307 if (size == -1) {
2308 size = wcslen(w);
2309 }
2310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002311 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002312}
2313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002315
/* Size of the character buffers used to format %lld-style integers and %p
   pointers.  A long long needs at most ceil(log10(256)*SIZEOF_LONG_LONG)
   decimal digits, plus 1 character for the sign.  53/22 is used as an
   upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (((SIZEOF_LONG_LONG * 53 - 1) / 22) + 2)
Victor Stinner96865452011-03-01 23:44:09 +00002320
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002321static int
2322unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2323 Py_ssize_t width, Py_ssize_t precision)
2324{
2325 Py_ssize_t length, fill, arglen;
2326 Py_UCS4 maxchar;
2327
2328 if (PyUnicode_READY(str) == -1)
2329 return -1;
2330
2331 length = PyUnicode_GET_LENGTH(str);
2332 if ((precision == -1 || precision >= length)
2333 && width <= length)
2334 return _PyUnicodeWriter_WriteStr(writer, str);
2335
2336 if (precision != -1)
2337 length = Py_MIN(precision, length);
2338
2339 arglen = Py_MAX(length, width);
2340 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2341 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2342 else
2343 maxchar = writer->maxchar;
2344
2345 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2346 return -1;
2347
2348 if (width > length) {
2349 fill = width - length;
2350 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2351 return -1;
2352 writer->pos += fill;
2353 }
2354
2355 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2356 str, 0, length);
2357 writer->pos += length;
2358 return 0;
2359}
2360
2361static int
2362unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2363 Py_ssize_t width, Py_ssize_t precision)
2364{
2365 /* UTF-8 */
2366 Py_ssize_t length;
2367 PyObject *unicode;
2368 int res;
2369
2370 length = strlen(str);
2371 if (precision != -1)
2372 length = Py_MIN(length, precision);
2373 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2374 if (unicode == NULL)
2375 return -1;
2376
2377 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2378 Py_DECREF(unicode);
2379 return res;
2380}
2381
/* Format a single '%' directive of a PyUnicode_FromFormatV() format string.

   f      points at the '%' that starts the directive.
   vargs  points at the argument list; the argument(s) required by the
          directive are consumed from it with va_arg().
   writer receives the formatted output.

   The directive is parsed as: optional '0' flag, optional decimal width,
   optional '.' followed by a decimal precision, optional 'l', 'll' or 'z'
   size modifier (only recognised in front of d, u or i), and finally the
   conversion character.

   Returns a pointer to the first character after the directive, or NULL on
   failure.  For an unknown conversion character the rest of the format
   string, starting at the '%', is copied to the output and the returned
   pointer is the end of the format string. */
Victor Stinner96865452011-03-01 23:44:09 +00002382static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002383unicode_fromformat_arg(_PyUnicodeWriter *writer,
2384 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002385{
Victor Stinnere215d962012-10-06 23:03:36 +02002386 const char *p;
2387 Py_ssize_t len;
2388 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002389 Py_ssize_t width;
2390 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002391 int longflag;
2392 int longlongflag;
2393 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002394 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002395
/* Remember where the directive starts (the '%'); the default case of the
   switch below copies the format string from this position. */
2396 p = f;
2397 f++;
/* Optional '0' flag: integers are padded to the width with '0' instead
   of ' '. */
Victor Stinner4c63a972012-10-06 23:55:33 +02002398 zeropad = 0;
2399 if (*f == '0') {
2400 zeropad = 1;
2401 f++;
2402 }
Victor Stinner96865452011-03-01 23:44:09 +00002403
2404 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
/* width and precision stay at -1 when they are not present.  Each digit
   loop checks, before accumulating, that the next step cannot exceed
   PY_SSIZE_T_MAX and fails with ValueError otherwise. */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002405 width = -1;
2406 if (Py_ISDIGIT((unsigned)*f)) {
2407 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002408 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002409 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002410 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002411 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002412 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002413 return NULL;
2414 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002415 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002416 f++;
2417 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002418 }
2419 precision = -1;
2420 if (*f == '.') {
2421 f++;
2422 if (Py_ISDIGIT((unsigned)*f)) {
2423 precision = (*f - '0');
2424 f++;
2425 while (Py_ISDIGIT((unsigned)*f)) {
2426 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2427 PyErr_SetString(PyExc_ValueError,
2428 "precision too big");
2429 return NULL;
2430 }
2431 precision = (precision * 10) + (*f - '0');
2432 f++;
2433 }
2434 }
/* Stepping back makes the switch below see a non-conversion character,
   so the directive ends up in the default (copy verbatim) case. */
Victor Stinner96865452011-03-01 23:44:09 +00002435 if (*f == '%') {
2436 /* "%.3%s" => f points to "3" */
2437 f--;
2438 }
2439 }
2440 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002441 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002442 f--;
2443 }
Victor Stinner96865452011-03-01 23:44:09 +00002444
2445 /* Handle %ld, %lu, %lld and %llu. */
/* A size modifier is only consumed when it is directly followed by d, u
   or i; after this block f points at the conversion character. */
2446 longflag = 0;
2447 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002448 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002449 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002450 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002451 longflag = 1;
2452 ++f;
2453 }
2454#ifdef HAVE_LONG_LONG
2455 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002456 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002457 longlongflag = 1;
2458 f += 2;
2459 }
2460#endif
2461 }
2462 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002463 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002464 size_tflag = 1;
2465 ++f;
2466 }
Victor Stinnere215d962012-10-06 23:03:36 +02002467
/* The conversion character is the last character of the format string:
   switch off the writer's over-allocation (presumably because nothing
   will be appended after this directive). */
2468 if (f[1] == '\0')
2469 writer->overallocate = 0;
2470
2471 switch (*f) {
/* %c: the argument is read as an int and must be a code point in
   0..MAX_UNICODE, otherwise OverflowError is set. */
2472 case 'c':
2473 {
2474 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002475 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002476 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002477 "character argument not in range(0x110000)");
2478 return NULL;
2479 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002480 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002481 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002482 break;
2483 }
2484
/* Integer conversions: the value is first rendered into a local buffer
   with sprintf(), using the argument type selected by the size flags;
   width and precision are then applied by hand while writing. */
2485 case 'i':
2486 case 'd':
2487 case 'u':
2488 case 'x':
2489 {
2490 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002491 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002492 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002493
2494 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002495 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002496 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002497 va_arg(*vargs, unsigned long));
2498#ifdef HAVE_LONG_LONG
2499 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002500 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002501 va_arg(*vargs, unsigned PY_LONG_LONG));
2502#endif
2503 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002504 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002505 va_arg(*vargs, size_t));
2506 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002507 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002508 va_arg(*vargs, unsigned int));
2509 }
/* %x does not consult the size flags (they are only set in front of d, u
   or i); the argument is read as an int. */
2510 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002511 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002512 }
2513 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002514 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002515 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002516 va_arg(*vargs, long));
2517#ifdef HAVE_LONG_LONG
2518 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002519 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002520 va_arg(*vargs, PY_LONG_LONG));
2521#endif
2522 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002523 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002524 va_arg(*vargs, Py_ssize_t));
2525 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002526 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002527 va_arg(*vargs, int));
2528 }
2529 assert(len >= 0);
2530
/* Raise the precision to at least the number of characters produced, so
   that both fill counts computed below are non-negative. */
Victor Stinnere215d962012-10-06 23:03:36 +02002531 if (precision < len)
2532 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002533
2534 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002535 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2536 return NULL;
2537
/* Output order: width padding ('0' or ' '), then '0' padding up to the
   precision, then the characters produced by sprintf(). */
Victor Stinnere215d962012-10-06 23:03:36 +02002538 if (width > precision) {
2539 Py_UCS4 fillchar;
2540 fill = width - precision;
2541 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002542 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2543 return NULL;
2544 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002545 }
Victor Stinner15a11362012-10-06 23:48:20 +02002546 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002547 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002548 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2549 return NULL;
2550 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002551 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002552
Victor Stinner4a587072013-11-19 12:54:53 +01002553 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2554 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002555 break;
2556 }
2557
/* %p: width and precision are not applied to pointers. */
2558 case 'p':
2559 {
2560 char number[MAX_LONG_LONG_CHARS];
2561
2562 len = sprintf(number, "%p", va_arg(*vargs, void*));
2563 assert(len >= 0);
2564
2565 /* %p is ill-defined: ensure leading 0x. */
2566 if (number[1] == 'X')
2567 number[1] = 'x';
2568 else if (number[1] != 'x') {
/* No prefix at all: shift the text (including its NUL) right by two and
   insert "0x" in front. */
2569 memmove(number + 2, number,
2570 strlen(number) + 1);
2571 number[0] = '0';
2572 number[1] = 'x';
2573 len += 2;
2574 }
2575
Victor Stinner4a587072013-11-19 12:54:53 +01002576 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002577 return NULL;
2578 break;
2579 }
2580
/* %s: a C string, written through unicode_fromformat_write_cstr(). */
2581 case 's':
2582 {
2583 /* UTF-8 */
2584 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002585 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002586 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002587 break;
2588 }
2589
/* %U: a string object, which must not be NULL (asserted). */
2590 case 'U':
2591 {
2592 PyObject *obj = va_arg(*vargs, PyObject *);
2593 assert(obj && _PyUnicode_CHECK(obj));
2594
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002595 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002596 return NULL;
2597 break;
2598 }
2599
/* %V: takes TWO arguments, a string object and a C string; both are
   always consumed.  The object is used when it is not NULL, otherwise
   the C string is used. */
2600 case 'V':
2601 {
2602 PyObject *obj = va_arg(*vargs, PyObject *);
2603 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002604 if (obj) {
2605 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002606 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002607 return NULL;
2608 }
2609 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002610 assert(str != NULL);
2611 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002612 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002613 }
2614 break;
2615 }
2616
/* %S, %R and %A: write the result of PyObject_Str(), PyObject_Repr() and
   PyObject_ASCII() respectively.  The temporary result is released on
   both the success and the failure path. */
2617 case 'S':
2618 {
2619 PyObject *obj = va_arg(*vargs, PyObject *);
2620 PyObject *str;
2621 assert(obj);
2622 str = PyObject_Str(obj);
2623 if (!str)
2624 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002625 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002626 Py_DECREF(str);
2627 return NULL;
2628 }
2629 Py_DECREF(str);
2630 break;
2631 }
2632
2633 case 'R':
2634 {
2635 PyObject *obj = va_arg(*vargs, PyObject *);
2636 PyObject *repr;
2637 assert(obj);
2638 repr = PyObject_Repr(obj);
2639 if (!repr)
2640 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002641 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002642 Py_DECREF(repr);
2643 return NULL;
2644 }
2645 Py_DECREF(repr);
2646 break;
2647 }
2648
2649 case 'A':
2650 {
2651 PyObject *obj = va_arg(*vargs, PyObject *);
2652 PyObject *ascii;
2653 assert(obj);
2654 ascii = PyObject_ASCII(obj);
2655 if (!ascii)
2656 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002657 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002658 Py_DECREF(ascii);
2659 return NULL;
2660 }
2661 Py_DECREF(ascii);
2662 break;
2663 }
2664
/* %%: a literal percent sign; no argument is consumed. */
2665 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002666 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002667 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002668 break;
2669
2670 default:
2671 /* if we stumble upon an unknown formatting code, copy the rest
2672 of the format string to the output string. (we cannot just
2673 skip the code, since there's no way to know what's in the
2674 argument list) */
2675 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002676 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002677 return NULL;
/* p + len is the terminating NUL of the format string, so the caller's
   loop over the format string ends. */
2678 f = p+len;
2679 return f;
2680 }
2681
/* Step over the conversion character. */
2682 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002683 return f;
2684}
2685
Walter Dörwaldd2034312007-05-18 16:29:38 +00002686PyObject *
2687PyUnicode_FromFormatV(const char *format, va_list vargs)
2688{
Victor Stinnere215d962012-10-06 23:03:36 +02002689 va_list vargs2;
2690 const char *f;
2691 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002692
Victor Stinner8f674cc2013-04-17 23:02:17 +02002693 _PyUnicodeWriter_Init(&writer);
2694 writer.min_length = strlen(format) + 100;
2695 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002696
2697 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2698 Copy it to be able to pass a reference to a subfunction. */
2699 Py_VA_COPY(vargs2, vargs);
2700
2701 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002702 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002703 f = unicode_fromformat_arg(&writer, f, &vargs2);
2704 if (f == NULL)
2705 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002707 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002708 const char *p;
2709 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002710
Victor Stinnere215d962012-10-06 23:03:36 +02002711 p = f;
2712 do
2713 {
2714 if ((unsigned char)*p > 127) {
2715 PyErr_Format(PyExc_ValueError,
2716 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2717 "string, got a non-ASCII byte: 0x%02x",
2718 (unsigned char)*p);
2719 return NULL;
2720 }
2721 p++;
2722 }
2723 while (*p != '\0' && *p != '%');
2724 len = p - f;
2725
2726 if (*p == '\0')
2727 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002728
2729 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002730 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002731
2732 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002733 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002734 }
Victor Stinnere215d962012-10-06 23:03:36 +02002735 return _PyUnicodeWriter_Finish(&writer);
2736
2737 fail:
2738 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002739 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002740}
2741
Walter Dörwaldd2034312007-05-18 16:29:38 +00002742PyObject *
2743PyUnicode_FromFormat(const char *format, ...)
2744{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002745 PyObject* ret;
2746 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002747
2748#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002749 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002750#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002751 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002752#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002753 ret = PyUnicode_FromFormatV(format, vargs);
2754 va_end(vargs);
2755 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002756}
2757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002758#ifdef HAVE_WCHAR_H
2759
Victor Stinner5593d8a2010-10-02 11:11:27 +00002760/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2761 convert a Unicode object to a wide character string.
2762
Victor Stinnerd88d9832011-09-06 02:00:05 +02002763 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002764 character) required to convert the unicode object. Ignore size argument.
2765
Victor Stinnerd88d9832011-09-06 02:00:05 +02002766 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002767 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002768 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002769static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002770unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002771 wchar_t *w,
2772 Py_ssize_t size)
2773{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002774 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002775 const wchar_t *wstr;
2776
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002777 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002778 if (wstr == NULL)
2779 return -1;
2780
Victor Stinner5593d8a2010-10-02 11:11:27 +00002781 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002782 if (size > res)
2783 size = res + 1;
2784 else
2785 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002786 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002787 return res;
2788 }
2789 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002790 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002791}
2792
2793Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002794PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002795 wchar_t *w,
2796 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797{
2798 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002799 PyErr_BadInternalCall();
2800 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002802 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803}
2804
Victor Stinner137c34c2010-09-29 10:25:54 +00002805wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002806PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002807 Py_ssize_t *size)
2808{
2809 wchar_t* buffer;
2810 Py_ssize_t buflen;
2811
2812 if (unicode == NULL) {
2813 PyErr_BadInternalCall();
2814 return NULL;
2815 }
2816
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002817 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002818 if (buflen == -1)
2819 return NULL;
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07002820 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < buflen) {
Victor Stinner137c34c2010-09-29 10:25:54 +00002821 PyErr_NoMemory();
2822 return NULL;
2823 }
2824
Victor Stinner137c34c2010-09-29 10:25:54 +00002825 buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2826 if (buffer == NULL) {
2827 PyErr_NoMemory();
2828 return NULL;
2829 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002830 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002831 if (buflen == -1) {
2832 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002833 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002834 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002835 if (size != NULL)
2836 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002837 return buffer;
2838}
2839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002840#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841
Alexander Belopolsky40018472011-02-26 01:02:56 +00002842PyObject *
2843PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002844{
Victor Stinner8faf8212011-12-08 22:14:11 +01002845 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002846 PyErr_SetString(PyExc_ValueError,
2847 "chr() arg not in range(0x110000)");
2848 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002849 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002850
Victor Stinner985a82a2014-01-03 12:53:47 +01002851 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002852}
2853
Alexander Belopolsky40018472011-02-26 01:02:56 +00002854PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002855PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002856{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002857 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002858 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002859 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002860 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002861 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002862 Py_INCREF(obj);
2863 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002864 }
2865 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002866 /* For a Unicode subtype that's not a Unicode object,
2867 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002868 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002869 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002870 PyErr_Format(PyExc_TypeError,
2871 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002872 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002873 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002874}
2875
Alexander Belopolsky40018472011-02-26 01:02:56 +00002876PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002877PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002878 const char *encoding,
2879 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002880{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002881 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002882 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002883
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002885 PyErr_BadInternalCall();
2886 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002887 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002888
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002889 /* Decoding bytes objects is the most common case and should be fast */
2890 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002891 if (PyBytes_GET_SIZE(obj) == 0)
2892 _Py_RETURN_UNICODE_EMPTY();
2893 v = PyUnicode_Decode(
2894 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2895 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002896 return v;
2897 }
2898
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002899 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002900 PyErr_SetString(PyExc_TypeError,
2901 "decoding str is not supported");
2902 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002903 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002904
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002905 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2906 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2907 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02002908 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002909 Py_TYPE(obj)->tp_name);
2910 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002911 }
Tim Petersced69f82003-09-16 20:30:58 +00002912
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002913 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002914 PyBuffer_Release(&buffer);
2915 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002916 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002917
Serhiy Storchaka05997252013-01-26 12:14:02 +02002918 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002919 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002920 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002921}
2922
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   lower receives the null-terminated result; lower_len is the number of
   bytes available in lower.

   Return 1 on success, 0 on error (the normalized name and its null
   terminator do not fit in lower_len bytes, which includes lower_len == 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* With no room even for the terminator, &lower[lower_len - 1] would
       point before the buffer: the bound check in the loop would never
       trigger and the writes below would fall outside of lower. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot, reserved for the null terminator. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2962
Alexander Belopolsky40018472011-02-26 01:02:56 +00002963PyObject *
2964PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002965 Py_ssize_t size,
2966 const char *encoding,
2967 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002968{
2969 PyObject *buffer = NULL, *unicode;
2970 Py_buffer info;
2971 char lower[11]; /* Enough for any encoding shortcut */
2972
Fred Drakee4315f52000-05-09 19:53:39 +00002973 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002974 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002975 if ((strcmp(lower, "utf-8") == 0) ||
2976 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002977 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002978 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002979 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01002980 (strcmp(lower, "iso-8859-1") == 0) ||
2981 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002982 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002983#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002984 else if (strcmp(lower, "mbcs") == 0)
2985 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002986#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002987 else if (strcmp(lower, "ascii") == 0)
2988 return PyUnicode_DecodeASCII(s, size, errors);
2989 else if (strcmp(lower, "utf-16") == 0)
2990 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2991 else if (strcmp(lower, "utf-32") == 0)
2992 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994
2995 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002996 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002997 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002998 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002999 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000 if (buffer == NULL)
3001 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003002 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003 if (unicode == NULL)
3004 goto onError;
3005 if (!PyUnicode_Check(unicode)) {
3006 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003007 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3008 "use codecs.decode() to decode to arbitrary types",
3009 encoding,
3010 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 Py_DECREF(unicode);
3012 goto onError;
3013 }
3014 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003015 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003016
Benjamin Peterson29060642009-01-31 22:14:21 +00003017 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018 Py_XDECREF(buffer);
3019 return NULL;
3020}
3021
Alexander Belopolsky40018472011-02-26 01:02:56 +00003022PyObject *
3023PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003024 const char *encoding,
3025 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003026{
3027 PyObject *v;
3028
3029 if (!PyUnicode_Check(unicode)) {
3030 PyErr_BadArgument();
3031 goto onError;
3032 }
3033
3034 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003035 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003036
3037 /* Decode via the codec registry */
3038 v = PyCodec_Decode(unicode, encoding, errors);
3039 if (v == NULL)
3040 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003041 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003042
Benjamin Peterson29060642009-01-31 22:14:21 +00003043 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003044 return NULL;
3045}
3046
Alexander Belopolsky40018472011-02-26 01:02:56 +00003047PyObject *
3048PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003049 const char *encoding,
3050 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003051{
3052 PyObject *v;
3053
3054 if (!PyUnicode_Check(unicode)) {
3055 PyErr_BadArgument();
3056 goto onError;
3057 }
3058
3059 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003060 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003061
3062 /* Decode via the codec registry */
3063 v = PyCodec_Decode(unicode, encoding, errors);
3064 if (v == NULL)
3065 goto onError;
3066 if (!PyUnicode_Check(v)) {
3067 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003068 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3069 "use codecs.decode() to decode to arbitrary types",
3070 encoding,
3071 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003072 Py_DECREF(v);
3073 goto onError;
3074 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003075 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003076
Benjamin Peterson29060642009-01-31 22:14:21 +00003077 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003078 return NULL;
3079}
3080
Alexander Belopolsky40018472011-02-26 01:02:56 +00003081PyObject *
3082PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003083 Py_ssize_t size,
3084 const char *encoding,
3085 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003086{
3087 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003088
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089 unicode = PyUnicode_FromUnicode(s, size);
3090 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3093 Py_DECREF(unicode);
3094 return v;
3095}
3096
Alexander Belopolsky40018472011-02-26 01:02:56 +00003097PyObject *
3098PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003099 const char *encoding,
3100 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003101{
3102 PyObject *v;
3103
3104 if (!PyUnicode_Check(unicode)) {
3105 PyErr_BadArgument();
3106 goto onError;
3107 }
3108
3109 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003110 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003111
3112 /* Encode via the codec registry */
3113 v = PyCodec_Encode(unicode, encoding, errors);
3114 if (v == NULL)
3115 goto onError;
3116 return v;
3117
Benjamin Peterson29060642009-01-31 22:14:21 +00003118 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003119 return NULL;
3120}
3121
/* Find the first wide character of the null-terminated string wstr that
   wcstombs() fails to convert, by converting the string one character at a
   time (one surrogate pair at a time when wchar_t is 16 bits wide).

   Return the index of that character in wstr, or 0 when every character
   converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    const wchar_t *const begin = wstr;
    const wchar_t *cur = wstr;
    char encoded[MB_LEN_MAX];
    /* Null-terminated scratch string holding the unit being converted. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif

    while (*cur != L'\0') {
        const wchar_t *unit_start = cur;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cur[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cur[1]))
        {
            /* Keep both halves of a surrogate pair together. */
            unit[0] = cur[0];
            unit[1] = cur[1];
            cur += 2;
        }
        else {
            unit[0] = *cur;
            unit[1] = 0;
            cur++;
        }
#else
        unit[0] = *cur;
        cur++;
#endif
        if (wcstombs(encoded, unit, sizeof(encoded)) == (size_t)-1)
            return unit_start - begin;
    }

    /* failed to find the unencodable character */
    return 0;
}
3168
Victor Stinner1b579672011-12-17 05:47:23 +01003169static int
3170locale_error_handler(const char *errors, int *surrogateescape)
3171{
3172 if (errors == NULL) {
3173 *surrogateescape = 0;
3174 return 0;
3175 }
3176
3177 if (strcmp(errors, "strict") == 0) {
3178 *surrogateescape = 0;
3179 return 0;
3180 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003181 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003182 *surrogateescape = 1;
3183 return 0;
3184 }
3185 PyErr_Format(PyExc_ValueError,
3186 "only 'strict' and 'surrogateescape' error handlers "
3187 "are supported, not '%s'",
3188 errors);
3189 return -1;
3190}
3191
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003192PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003193PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003194{
3195 Py_ssize_t wlen, wlen2;
3196 wchar_t *wstr;
3197 PyObject *bytes = NULL;
3198 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003199 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003200 PyObject *exc;
3201 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003202 int surrogateescape;
3203
3204 if (locale_error_handler(errors, &surrogateescape) < 0)
3205 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003206
3207 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3208 if (wstr == NULL)
3209 return NULL;
3210
3211 wlen2 = wcslen(wstr);
3212 if (wlen2 != wlen) {
3213 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003214 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003215 return NULL;
3216 }
3217
3218 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003219 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003220 char *str;
3221
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003222 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003223 if (str == NULL) {
3224 if (error_pos == (size_t)-1) {
3225 PyErr_NoMemory();
3226 PyMem_Free(wstr);
3227 return NULL;
3228 }
3229 else {
3230 goto encode_error;
3231 }
3232 }
3233 PyMem_Free(wstr);
3234
3235 bytes = PyBytes_FromString(str);
3236 PyMem_Free(str);
3237 }
3238 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003239 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003240 size_t len, len2;
3241
3242 len = wcstombs(NULL, wstr, 0);
3243 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003244 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003245 goto encode_error;
3246 }
3247
3248 bytes = PyBytes_FromStringAndSize(NULL, len);
3249 if (bytes == NULL) {
3250 PyMem_Free(wstr);
3251 return NULL;
3252 }
3253
3254 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3255 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003256 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003257 goto encode_error;
3258 }
3259 PyMem_Free(wstr);
3260 }
3261 return bytes;
3262
3263encode_error:
3264 errmsg = strerror(errno);
3265 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003266
3267 if (error_pos == (size_t)-1)
3268 error_pos = wcstombs_errorpos(wstr);
3269
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003270 PyMem_Free(wstr);
3271 Py_XDECREF(bytes);
3272
Victor Stinner2f197072011-12-17 07:08:30 +01003273 if (errmsg != NULL) {
3274 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003275 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003276 if (wstr != NULL) {
3277 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003278 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003279 } else
3280 errmsg = NULL;
3281 }
3282 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003283 reason = PyUnicode_FromString(
3284 "wcstombs() encountered an unencodable "
3285 "wide character");
3286 if (reason == NULL)
3287 return NULL;
3288
3289 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3290 "locale", unicode,
3291 (Py_ssize_t)error_pos,
3292 (Py_ssize_t)(error_pos+1),
3293 reason);
3294 Py_DECREF(reason);
3295 if (exc != NULL) {
3296 PyCodec_StrictErrors(exc);
3297 Py_XDECREF(exc);
3298 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003299 return NULL;
3300}
3301
Victor Stinnerad158722010-10-27 00:25:46 +00003302PyObject *
3303PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003304{
Victor Stinner99b95382011-07-04 14:23:54 +02003305#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003306 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003307#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003308 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003309#else
Victor Stinner793b5312011-04-27 00:24:21 +02003310 PyInterpreterState *interp = PyThreadState_GET()->interp;
3311 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3312 cannot use it to encode and decode filenames before it is loaded. Load
3313 the Python codec requires to encode at least its own filename. Use the C
3314 version of the locale codec until the codec registry is initialized and
3315 the Python codec is loaded.
3316
3317 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3318 cannot only rely on it: check also interp->fscodec_initialized for
3319 subinterpreters. */
3320 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003321 return PyUnicode_AsEncodedString(unicode,
3322 Py_FileSystemDefaultEncoding,
3323 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003324 }
3325 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003326 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003327 }
Victor Stinnerad158722010-10-27 00:25:46 +00003328#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003329}
3330
Alexander Belopolsky40018472011-02-26 01:02:56 +00003331PyObject *
3332PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003333 const char *encoding,
3334 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335{
3336 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003337 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003338
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339 if (!PyUnicode_Check(unicode)) {
3340 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003341 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342 }
Fred Drakee4315f52000-05-09 19:53:39 +00003343
Fred Drakee4315f52000-05-09 19:53:39 +00003344 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003345 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003346 if ((strcmp(lower, "utf-8") == 0) ||
3347 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003348 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003349 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003350 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003351 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003352 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003353 }
Victor Stinner37296e82010-06-10 13:36:23 +00003354 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003355 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003356 (strcmp(lower, "iso-8859-1") == 0) ||
3357 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003358 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003359#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003360 else if (strcmp(lower, "mbcs") == 0)
3361 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003362#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003363 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003364 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003365 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366
3367 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003368 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003370 return NULL;
3371
3372 /* The normal path */
3373 if (PyBytes_Check(v))
3374 return v;
3375
3376 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003377 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003378 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003379 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003380
3381 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003382 "encoder %s returned bytearray instead of bytes; "
3383 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003384 encoding);
3385 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003386 Py_DECREF(v);
3387 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003388 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003389
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003390 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3391 Py_DECREF(v);
3392 return b;
3393 }
3394
3395 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003396 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3397 "use codecs.encode() to encode to arbitrary types",
3398 encoding,
3399 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003400 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003401 return NULL;
3402}
3403
Alexander Belopolsky40018472011-02-26 01:02:56 +00003404PyObject *
3405PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003406 const char *encoding,
3407 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003408{
3409 PyObject *v;
3410
3411 if (!PyUnicode_Check(unicode)) {
3412 PyErr_BadArgument();
3413 goto onError;
3414 }
3415
3416 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003417 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003418
3419 /* Encode via the codec registry */
3420 v = PyCodec_Encode(unicode, encoding, errors);
3421 if (v == NULL)
3422 goto onError;
3423 if (!PyUnicode_Check(v)) {
3424 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003425 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3426 "use codecs.encode() to encode to arbitrary types",
3427 encoding,
3428 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003429 Py_DECREF(v);
3430 goto onError;
3431 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003432 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003433
Benjamin Peterson29060642009-01-31 22:14:21 +00003434 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435 return NULL;
3436}
3437
/* Find the offset in str (len bytes long) of the first byte sequence that
   mbrtowc() reports as invalid or incomplete.

   Return 0 when no such sequence is found, and always when HAVE_MBRTOWC is
   not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3468
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003469PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003470PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003471 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003472{
3473 wchar_t smallbuf[256];
3474 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3475 wchar_t *wstr;
3476 size_t wlen, wlen2;
3477 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003478 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003479 size_t error_pos;
3480 char *errmsg;
3481 PyObject *reason, *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003482
3483 if (locale_error_handler(errors, &surrogateescape) < 0)
3484 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003485
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003486 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3487 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003488 return NULL;
3489 }
3490
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003491 if (surrogateescape) {
3492 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003493 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003494 if (wstr == NULL) {
3495 if (wlen == (size_t)-1)
3496 PyErr_NoMemory();
3497 else
3498 PyErr_SetFromErrno(PyExc_OSError);
3499 return NULL;
3500 }
3501
3502 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003503 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003504 }
3505 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003506 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003507#ifndef HAVE_BROKEN_MBSTOWCS
3508 wlen = mbstowcs(NULL, str, 0);
3509#else
3510 wlen = len;
3511#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003512 if (wlen == (size_t)-1)
3513 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003514 if (wlen+1 <= smallbuf_len) {
3515 wstr = smallbuf;
3516 }
3517 else {
3518 if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3519 return PyErr_NoMemory();
3520
3521 wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3522 if (!wstr)
3523 return PyErr_NoMemory();
3524 }
3525
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003526 wlen2 = mbstowcs(wstr, str, wlen+1);
3527 if (wlen2 == (size_t)-1) {
3528 if (wstr != smallbuf)
3529 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003530 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003531 }
3532#ifdef HAVE_BROKEN_MBSTOWCS
3533 assert(wlen2 == wlen);
3534#endif
3535 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3536 if (wstr != smallbuf)
3537 PyMem_Free(wstr);
3538 }
3539 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003540
3541decode_error:
3542 errmsg = strerror(errno);
3543 assert(errmsg != NULL);
3544
3545 error_pos = mbstowcs_errorpos(str, len);
3546 if (errmsg != NULL) {
3547 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003548 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003549 if (wstr != NULL) {
3550 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003551 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003552 } else
3553 errmsg = NULL;
3554 }
3555 if (errmsg == NULL)
3556 reason = PyUnicode_FromString(
3557 "mbstowcs() encountered an invalid multibyte sequence");
3558 if (reason == NULL)
3559 return NULL;
3560
3561 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3562 "locale", str, len,
3563 (Py_ssize_t)error_pos,
3564 (Py_ssize_t)(error_pos+1),
3565 reason);
3566 Py_DECREF(reason);
3567 if (exc != NULL) {
3568 PyCodec_StrictErrors(exc);
3569 Py_XDECREF(exc);
3570 }
3571 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003572}
3573
3574PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003575PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003576{
3577 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003578 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003579}
3580
3581
3582PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003583PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003584 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003585 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3586}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003587
Christian Heimes5894ba72007-11-04 11:43:14 +00003588PyObject*
3589PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3590{
Victor Stinner99b95382011-07-04 14:23:54 +02003591#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003592 return PyUnicode_DecodeMBCS(s, size, NULL);
3593#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003594 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003595#else
Victor Stinner793b5312011-04-27 00:24:21 +02003596 PyInterpreterState *interp = PyThreadState_GET()->interp;
3597 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3598 cannot use it to encode and decode filenames before it is loaded. Load
3599 the Python codec requires to encode at least its own filename. Use the C
3600 version of the locale codec until the codec registry is initialized and
3601 the Python codec is loaded.
3602
3603 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3604 cannot only rely on it: check also interp->fscodec_initialized for
3605 subinterpreters. */
3606 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003607 return PyUnicode_Decode(s, size,
3608 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003609 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003610 }
3611 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003612 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003613 }
Victor Stinnerad158722010-10-27 00:25:46 +00003614#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003615}
3616
Martin v. Löwis011e8422009-05-05 04:43:17 +00003617
3618int
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003619_PyUnicode_HasNULChars(PyObject* str)
Antoine Pitrou13348842012-01-29 18:36:34 +01003620{
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003621 Py_ssize_t pos;
Antoine Pitrou13348842012-01-29 18:36:34 +01003622
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003623 if (PyUnicode_READY(str) == -1)
Antoine Pitrou13348842012-01-29 18:36:34 +01003624 return -1;
Victor Stinnerfe75fb42012-10-23 02:52:18 +02003625 pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3626 PyUnicode_GET_LENGTH(str), '\0', 1);
3627 if (pos == -1)
3628 return 0;
3629 else
3630 return 1;
Antoine Pitrou13348842012-01-29 18:36:34 +01003631}
3632
Antoine Pitrou13348842012-01-29 18:36:34 +01003633int
Martin v. Löwis011e8422009-05-05 04:43:17 +00003634PyUnicode_FSConverter(PyObject* arg, void* addr)
3635{
3636 PyObject *output = NULL;
3637 Py_ssize_t size;
3638 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003639 if (arg == NULL) {
3640 Py_DECREF(*(PyObject**)addr);
3641 return 1;
3642 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003643 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003644 output = arg;
3645 Py_INCREF(output);
3646 }
3647 else {
3648 arg = PyUnicode_FromObject(arg);
3649 if (!arg)
3650 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003651 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003652 Py_DECREF(arg);
3653 if (!output)
3654 return 0;
3655 if (!PyBytes_Check(output)) {
3656 Py_DECREF(output);
3657 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3658 return 0;
3659 }
3660 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003661 size = PyBytes_GET_SIZE(output);
3662 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003663 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003664 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003665 Py_DECREF(output);
3666 return 0;
3667 }
3668 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003669 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003670}
3671
3672
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003673int
3674PyUnicode_FSDecoder(PyObject* arg, void* addr)
3675{
3676 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003677 if (arg == NULL) {
3678 Py_DECREF(*(PyObject**)addr);
3679 return 1;
3680 }
3681 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003682 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003683 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003684 output = arg;
3685 Py_INCREF(output);
3686 }
3687 else {
3688 arg = PyBytes_FromObject(arg);
3689 if (!arg)
3690 return 0;
3691 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3692 PyBytes_GET_SIZE(arg));
3693 Py_DECREF(arg);
3694 if (!output)
3695 return 0;
3696 if (!PyUnicode_Check(output)) {
3697 Py_DECREF(output);
3698 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3699 return 0;
3700 }
3701 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003702 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003703 Py_DECREF(output);
3704 return 0;
3705 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003706 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003707 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003708 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003709 Py_DECREF(output);
3710 return 0;
3711 }
3712 *(PyObject**)addr = output;
3713 return Py_CLEANUP_SUPPORTED;
3714}
3715
3716
Martin v. Löwis5b222132007-06-10 09:51:05 +00003717char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003718PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003719{
Christian Heimesf3863112007-11-22 07:46:41 +00003720 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003721
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003722 if (!PyUnicode_Check(unicode)) {
3723 PyErr_BadArgument();
3724 return NULL;
3725 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003726 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003727 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003728
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003729 if (PyUnicode_UTF8(unicode) == NULL) {
3730 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003731 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3732 if (bytes == NULL)
3733 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003734 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3735 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003736 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003737 Py_DECREF(bytes);
3738 return NULL;
3739 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003740 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3741 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3742 PyBytes_AS_STRING(bytes),
3743 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003744 Py_DECREF(bytes);
3745 }
3746
3747 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003748 *psize = PyUnicode_UTF8_LENGTH(unicode);
3749 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003750}
3751
3752char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003753PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003754{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003755 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3756}
3757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758Py_UNICODE *
3759PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3760{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761 const unsigned char *one_byte;
3762#if SIZEOF_WCHAR_T == 4
3763 const Py_UCS2 *two_bytes;
3764#else
3765 const Py_UCS4 *four_bytes;
3766 const Py_UCS4 *ucs4_end;
3767 Py_ssize_t num_surrogates;
3768#endif
3769 wchar_t *w;
3770 wchar_t *wchar_end;
3771
3772 if (!PyUnicode_Check(unicode)) {
3773 PyErr_BadArgument();
3774 return NULL;
3775 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003776 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003777 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003778 assert(_PyUnicode_KIND(unicode) != 0);
3779 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003780
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003781 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003782#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003783 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3784 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785 num_surrogates = 0;
3786
3787 for (; four_bytes < ucs4_end; ++four_bytes) {
3788 if (*four_bytes > 0xFFFF)
3789 ++num_surrogates;
3790 }
3791
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003792 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3793 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3794 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003795 PyErr_NoMemory();
3796 return NULL;
3797 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003798 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003799
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003800 w = _PyUnicode_WSTR(unicode);
3801 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3802 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003803 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3804 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003805 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003806 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003807 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3808 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003809 }
3810 else
3811 *w = *four_bytes;
3812
3813 if (w > wchar_end) {
3814 assert(0 && "Miscalculated string end");
3815 }
3816 }
3817 *w = 0;
3818#else
3819 /* sizeof(wchar_t) == 4 */
3820 Py_FatalError("Impossible unicode object state, wstr and str "
3821 "should share memory already.");
3822 return NULL;
3823#endif
3824 }
3825 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003826 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3827 (_PyUnicode_LENGTH(unicode) + 1));
3828 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003829 PyErr_NoMemory();
3830 return NULL;
3831 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003832 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3833 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3834 w = _PyUnicode_WSTR(unicode);
3835 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003837 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3838 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003839 for (; w < wchar_end; ++one_byte, ++w)
3840 *w = *one_byte;
3841 /* null-terminate the wstr */
3842 *w = 0;
3843 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003844 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003845#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003846 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003847 for (; w < wchar_end; ++two_bytes, ++w)
3848 *w = *two_bytes;
3849 /* null-terminate the wstr */
3850 *w = 0;
3851#else
3852 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003853 PyObject_FREE(_PyUnicode_WSTR(unicode));
3854 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003855 Py_FatalError("Impossible unicode object state, wstr "
3856 "and str should share memory already.");
3857 return NULL;
3858#endif
3859 }
3860 else {
3861 assert(0 && "This should never happen.");
3862 }
3863 }
3864 }
3865 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003866 *size = PyUnicode_WSTR_LENGTH(unicode);
3867 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003868}
3869
Alexander Belopolsky40018472011-02-26 01:02:56 +00003870Py_UNICODE *
3871PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003873 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874}
3875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003876
Alexander Belopolsky40018472011-02-26 01:02:56 +00003877Py_ssize_t
3878PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003879{
3880 if (!PyUnicode_Check(unicode)) {
3881 PyErr_BadArgument();
3882 goto onError;
3883 }
3884 return PyUnicode_GET_SIZE(unicode);
3885
Benjamin Peterson29060642009-01-31 22:14:21 +00003886 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887 return -1;
3888}
3889
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003890Py_ssize_t
3891PyUnicode_GetLength(PyObject *unicode)
3892{
Victor Stinner07621332012-06-16 04:53:46 +02003893 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003894 PyErr_BadArgument();
3895 return -1;
3896 }
Victor Stinner07621332012-06-16 04:53:46 +02003897 if (PyUnicode_READY(unicode) == -1)
3898 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899 return PyUnicode_GET_LENGTH(unicode);
3900}
3901
3902Py_UCS4
3903PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3904{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003905 void *data;
3906 int kind;
3907
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003908 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3909 PyErr_BadArgument();
3910 return (Py_UCS4)-1;
3911 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003912 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003913 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003914 return (Py_UCS4)-1;
3915 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003916 data = PyUnicode_DATA(unicode);
3917 kind = PyUnicode_KIND(unicode);
3918 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919}
3920
3921int
3922PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3923{
3924 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003925 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003926 return -1;
3927 }
Victor Stinner488fa492011-12-12 00:01:39 +01003928 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003929 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003930 PyErr_SetString(PyExc_IndexError, "string index out of range");
3931 return -1;
3932 }
Victor Stinner488fa492011-12-12 00:01:39 +01003933 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003934 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003935 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3936 PyErr_SetString(PyExc_ValueError, "character out of range");
3937 return -1;
3938 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3940 index, ch);
3941 return 0;
3942}
3943
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3949
Victor Stinner554f3f02010-06-16 23:33:54 +00003950/* create or adjust a UnicodeDecodeError */
3951static void
3952make_decode_exception(PyObject **exceptionObject,
3953 const char *encoding,
3954 const char *input, Py_ssize_t length,
3955 Py_ssize_t startpos, Py_ssize_t endpos,
3956 const char *reason)
3957{
3958 if (*exceptionObject == NULL) {
3959 *exceptionObject = PyUnicodeDecodeError_Create(
3960 encoding, input, length, startpos, endpos, reason);
3961 }
3962 else {
3963 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3964 goto onError;
3965 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3966 goto onError;
3967 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3968 goto onError;
3969 }
3970 return;
3971
3972onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02003973 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00003974}
3975
#ifdef HAVE_MBCS
/* error handling callback helper (wchar_t output variant):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.

   *errorHandler is looked up from `errors` on first use.  *input and *inend
   are refreshed from the exception's object attribute, since the callback may
   have replaced it.  *output is resized when the replacement plus the rest of
   the input does not fit, the replacement is copied at *outpos, and *outpos,
   *endinpos and *inptr are advanced.

   return 0 on success, -1 on error
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after "O!n;" doubles as the TypeError message, see
       &argparse[4] below. */
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the PyBytes_* macros below must not be applied to a
           non-bytes object. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when doubling cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4091
4092static int
4093unicode_decode_call_errorhandler_writer(
4094 const char *errors, PyObject **errorHandler,
4095 const char *encoding, const char *reason,
4096 const char **input, const char **inend, Py_ssize_t *startinpos,
4097 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4098 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4099{
4100 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4101
4102 PyObject *restuple = NULL;
4103 PyObject *repunicode = NULL;
4104 Py_ssize_t insize;
4105 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004106 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004107 PyObject *inputobj = NULL;
4108
4109 if (*errorHandler == NULL) {
4110 *errorHandler = PyCodec_LookupError(errors);
4111 if (*errorHandler == NULL)
4112 goto onError;
4113 }
4114
4115 make_decode_exception(exceptionObject,
4116 encoding,
4117 *input, *inend - *input,
4118 *startinpos, *endinpos,
4119 reason);
4120 if (*exceptionObject == NULL)
4121 goto onError;
4122
4123 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4124 if (restuple == NULL)
4125 goto onError;
4126 if (!PyTuple_Check(restuple)) {
4127 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4128 goto onError;
4129 }
4130 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004131 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004132
4133 /* Copy back the bytes variables, which might have been modified by the
4134 callback */
4135 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4136 if (!inputobj)
4137 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004138 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004139 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004140 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004141 *input = PyBytes_AS_STRING(inputobj);
4142 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004143 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004144 /* we can DECREF safely, as the exception has another reference,
4145 so the object won't go away. */
4146 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004147
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004149 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004150 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004151 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004152 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004153 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004154
Victor Stinner8f674cc2013-04-17 23:02:17 +02004155 if (PyUnicode_READY(repunicode) < 0)
4156 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004157 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004158 if (replen > 1) {
4159 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004160 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004161 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4162 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4163 goto onError;
4164 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004165 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004166 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004167
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004168 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004169 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004170
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004171 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004172 Py_XDECREF(restuple);
4173 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174
Benjamin Peterson29060642009-01-31 22:14:21 +00004175 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004176 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004177 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178}
4179
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004180/* --- UTF-7 Codec -------------------------------------------------------- */
4181
Antoine Pitrou244651a2009-05-04 18:56:13 +00004182/* See RFC2152 for details. We encode conservatively and decode liberally. */
4183
4184/* Three simple macros defining base-64. */
4185
4186/* Is c a base-64 character? */
4187
4188#define IS_BASE64(c) \
4189 (((c) >= 'A' && (c) <= 'Z') || \
4190 ((c) >= 'a' && (c) <= 'z') || \
4191 ((c) >= '0' && (c) <= '9') || \
4192 (c) == '+' || (c) == '/')
4193
4194/* given that c is a base-64 character, what is its base-64 value? */
4195
4196#define FROM_BASE64(c) \
4197 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
4198 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
4199 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
4200 (c) == '+' ? 62 : 63)
4201
4202/* What is the base-64 character of the bottom 6 bits of n? */
4203
4204#define TO_BASE64(n) \
4205 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4206
4207/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4208 * decoded as itself. We are permissive on decoding; the only ASCII
4209 * byte not decoding to itself is the + which begins a base64
4210 * string. */
4211
4212#define DECODE_DIRECT(c) \
4213 ((c) <= 127 && (c) != '+')
4214
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by ASCII code (0-127),
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4248
/* ENCODE_DIRECT: true when the encoder may emit character c as itself.
 * Only non-nul ASCII characters are candidates.  Set D characters are
 * always direct; Set O characters are direct only when directO is true
 * and whitespace only when directWS is true.  RFC2152 leaves both
 * choices to the application, hence the two flags.  c is evaluated
 * several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((utf7_category[(c)] == 2) && (directWS)) ||      \
      ((utf7_category[(c)] == 1) && (directO))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004260
Alexander Belopolsky40018472011-02-26 01:02:56 +00004261PyObject *
4262PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004263 Py_ssize_t size,
4264 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004265{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004266 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4267}
4268
Antoine Pitrou244651a2009-05-04 18:56:13 +00004269/* The decoder. The only state we preserve is our read position,
4270 * i.e. how many characters we have consumed. So if we end in the
4271 * middle of a shift sequence we have to back off the read position
4272 * and the output to the beginning of the sequence, otherwise we lose
4273 * all the shift state (seen bits, number of bits seen, high
4274 * surrogate). */
4275
Alexander Belopolsky40018472011-02-26 01:02:56 +00004276PyObject *
4277PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004278 Py_ssize_t size,
4279 const char *errors,
4280 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004281{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004282 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004283 Py_ssize_t startinpos;
4284 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004285 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004286 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004287 const char *errmsg = "";
4288 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004289 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004290 unsigned int base64bits = 0;
4291 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004292 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004293 PyObject *errorHandler = NULL;
4294 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004295
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004296 if (size == 0) {
4297 if (consumed)
4298 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004299 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004300 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004301
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004302 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004303 _PyUnicodeWriter_Init(&writer);
4304 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004305
4306 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004307 e = s + size;
4308
4309 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004310 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004311 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004312 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004313
Antoine Pitrou244651a2009-05-04 18:56:13 +00004314 if (inShift) { /* in a base-64 section */
4315 if (IS_BASE64(ch)) { /* consume a base-64 character */
4316 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4317 base64bits += 6;
4318 s++;
4319 if (base64bits >= 16) {
4320 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004321 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004322 base64bits -= 16;
4323 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004324 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004325 if (surrogate) {
4326 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004327 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4328 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004329 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004330 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004331 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004332 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004333 }
4334 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004335 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004336 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004337 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004338 }
4339 }
Victor Stinner551ac952011-11-29 22:58:13 +01004340 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004341 /* first surrogate */
4342 surrogate = outCh;
4343 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004344 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004345 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004346 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004347 }
4348 }
4349 }
4350 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004351 inShift = 0;
4352 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004353 if (surrogate) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004354 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004355 goto onError;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004356 surrogate = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004357 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004358 if (base64bits > 0) { /* left-over bits */
4359 if (base64bits >= 6) {
4360 /* We've seen at least one base-64 character */
4361 errmsg = "partial character in shift sequence";
4362 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004363 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004364 else {
4365 /* Some bits remain; they should be zero */
4366 if (base64buffer != 0) {
4367 errmsg = "non-zero padding bits in shift sequence";
4368 goto utf7Error;
4369 }
4370 }
4371 }
4372 if (ch != '-') {
4373 /* '-' is absorbed; other terminating
4374 characters are preserved */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004375 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004376 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004377 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004378 }
4379 }
4380 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004381 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004382 s++; /* consume '+' */
4383 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004384 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004385 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004386 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004387 }
4388 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004389 inShift = 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004390 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004391 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004392 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004393 }
4394 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004395 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004396 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004397 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004398 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004399 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004400 else {
4401 startinpos = s-starts;
4402 s++;
4403 errmsg = "unexpected special character";
4404 goto utf7Error;
4405 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004406 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004407utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004408 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004409 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004410 errors, &errorHandler,
4411 "utf7", errmsg,
4412 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004413 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004414 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004415 }
4416
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417 /* end of string */
4418
4419 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4420 /* if we're in an inconsistent state, that's an error */
4421 if (surrogate ||
4422 (base64bits >= 6) ||
4423 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004424 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004425 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004426 errors, &errorHandler,
4427 "utf7", "unterminated shift sequence",
4428 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004429 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004430 goto onError;
4431 if (s < e)
4432 goto restart;
4433 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004434 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004435
4436 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004437 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004438 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004439 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004440 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004441 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004442 writer.kind, writer.data, shiftOutStart);
4443 Py_XDECREF(errorHandler);
4444 Py_XDECREF(exc);
4445 _PyUnicodeWriter_Dealloc(&writer);
4446 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004447 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004448 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004449 }
4450 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004451 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004453 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004454
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004455 Py_XDECREF(errorHandler);
4456 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004457 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004458
Benjamin Peterson29060642009-01-31 22:14:21 +00004459 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004460 Py_XDECREF(errorHandler);
4461 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004462 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004463 return NULL;
4464}
4465
4466
Alexander Belopolsky40018472011-02-26 01:02:56 +00004467PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004468_PyUnicode_EncodeUTF7(PyObject *str,
4469 int base64SetO,
4470 int base64WhiteSpace,
4471 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004473 int kind;
4474 void *data;
4475 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004476 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004477 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004478 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004479 unsigned int base64bits = 0;
4480 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004481 char * out;
4482 char * start;
4483
Benjamin Petersonbac79492012-01-14 13:34:47 -05004484 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004485 return NULL;
4486 kind = PyUnicode_KIND(str);
4487 data = PyUnicode_DATA(str);
4488 len = PyUnicode_GET_LENGTH(str);
4489
4490 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004491 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004493 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004494 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004495 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004496 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004497 if (v == NULL)
4498 return NULL;
4499
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004500 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004501 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004502 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004503
Antoine Pitrou244651a2009-05-04 18:56:13 +00004504 if (inShift) {
4505 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4506 /* shifting out */
4507 if (base64bits) { /* output remaining bits */
4508 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4509 base64buffer = 0;
4510 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004511 }
4512 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004513 /* Characters not in the BASE64 set implicitly unshift the sequence
4514 so no '-' is required, except if the character is itself a '-' */
4515 if (IS_BASE64(ch) || ch == '-') {
4516 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004517 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004518 *out++ = (char) ch;
4519 }
4520 else {
4521 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004522 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004523 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524 else { /* not in a shift sequence */
4525 if (ch == '+') {
4526 *out++ = '+';
4527 *out++ = '-';
4528 }
4529 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4530 *out++ = (char) ch;
4531 }
4532 else {
4533 *out++ = '+';
4534 inShift = 1;
4535 goto encode_char;
4536 }
4537 }
4538 continue;
4539encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004540 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004541 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004542
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 /* code first surrogate */
4544 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004545 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004546 while (base64bits >= 6) {
4547 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4548 base64bits -= 6;
4549 }
4550 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004551 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004552 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004553 base64bits += 16;
4554 base64buffer = (base64buffer << 16) | ch;
4555 while (base64bits >= 6) {
4556 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4557 base64bits -= 6;
4558 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004559 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004560 if (base64bits)
4561 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4562 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004563 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004564 if (_PyBytes_Resize(&v, out - start) < 0)
4565 return NULL;
4566 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004567}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004568PyObject *
4569PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4570 Py_ssize_t size,
4571 int base64SetO,
4572 int base64WhiteSpace,
4573 const char *errors)
4574{
4575 PyObject *result;
4576 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4577 if (tmp == NULL)
4578 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004579 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004580 base64WhiteSpace, errors);
4581 Py_DECREF(tmp);
4582 return result;
4583}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584
/* The UTF-7 codec functions are complete; undefine its helper macros. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004590
Guido van Rossumd57fd912000-03-10 22:53:23 +00004591/* --- UTF-8 Codec -------------------------------------------------------- */
4592
Alexander Belopolsky40018472011-02-26 01:02:56 +00004593PyObject *
4594PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004595 Py_ssize_t size,
4596 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597{
Walter Dörwald69652032004-09-07 20:24:22 +00004598 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4599}
4600
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004601#include "stringlib/asciilib.h"
4602#include "stringlib/codecs.h"
4603#include "stringlib/undef.h"
4604
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004605#include "stringlib/ucs1lib.h"
4606#include "stringlib/codecs.h"
4607#include "stringlib/undef.h"
4608
4609#include "stringlib/ucs2lib.h"
4610#include "stringlib/codecs.h"
4611#include "stringlib/undef.h"
4612
4613#include "stringlib/ucs4lib.h"
4614#include "stringlib/codecs.h"
4615#include "stringlib/undef.h"
4616
Antoine Pitrouab868312009-01-10 15:40:25 +00004617/* Mask to quickly check whether a C 'long' contains a
4618 non-ASCII, UTF8-encoded char. */
4619#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004620# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004621#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004622# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004623#else
4624# error C 'long' size should be either 4 or 8!
4625#endif
4626
/* Copy the leading run of ASCII bytes (high bit clear) of [start, end)
   into dest and return how many bytes were copied.  Scanning stops at
   the first byte with bit 0x80 set, or at end.

   dest is assumed to have room for end - start bytes (TODO confirm
   against callers).  On the word-at-a-time path below dest must be
   aligned to SIZEOF_LONG; this is asserted. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* last SIZEOF_LONG-aligned address not beyond end: the word loops
       must stop here so they never read past the input */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Source and destination are both aligned here, so whole longs
           can be checked and copied at once. */
        /* Help allocation */
        const char *_p = p;
        Py_UCS1 * q = dest;
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            /* some byte in this word is non-ASCII: let the byte loop
               below find exactly which one */
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* byte-wise tail: finishes the partial word / unaligned rest */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        /* everything was copied as we went; no memcpy needed */
        return p - start;
    }
#endif
#endif
    /* Generic path: only scan for the end of the ASCII run here, then
       copy the whole run with one memcpy at the bottom. */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            /* the word scan reached the end of the input: *p must not
               be read */
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
Antoine Pitrouab868312009-01-10 15:40:25 +00004690
Victor Stinner785938e2011-12-11 20:09:03 +01004691PyObject *
4692PyUnicode_DecodeUTF8Stateful(const char *s,
4693 Py_ssize_t size,
4694 const char *errors,
4695 Py_ssize_t *consumed)
4696{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004697 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004698 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004699 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004700
4701 Py_ssize_t startinpos;
4702 Py_ssize_t endinpos;
4703 const char *errmsg = "";
4704 PyObject *errorHandler = NULL;
4705 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004706
4707 if (size == 0) {
4708 if (consumed)
4709 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004710 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004711 }
4712
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004713 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4714 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004715 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004716 *consumed = 1;
4717 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004718 }
4719
Victor Stinner8f674cc2013-04-17 23:02:17 +02004720 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004721 writer.min_length = size;
4722 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004723 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004724
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004725 writer.pos = ascii_decode(s, end, writer.data);
4726 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004727 while (s < end) {
4728 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004729 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004730 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004731 if (PyUnicode_IS_ASCII(writer.buffer))
4732 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004733 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004734 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004735 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004736 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004737 } else {
4738 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004739 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004740 }
4741
4742 switch (ch) {
4743 case 0:
4744 if (s == end || consumed)
4745 goto End;
4746 errmsg = "unexpected end of data";
4747 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004748 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004749 break;
4750 case 1:
4751 errmsg = "invalid start byte";
4752 startinpos = s - starts;
4753 endinpos = startinpos + 1;
4754 break;
4755 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004756 case 3:
4757 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004758 errmsg = "invalid continuation byte";
4759 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004760 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004761 break;
4762 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004763 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004764 goto onError;
4765 continue;
4766 }
4767
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004768 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004769 errors, &errorHandler,
4770 "utf-8", errmsg,
4771 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004772 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004773 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004774 }
4775
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004776End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004777 if (consumed)
4778 *consumed = s - starts;
4779
4780 Py_XDECREF(errorHandler);
4781 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004782 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004783
4784onError:
4785 Py_XDECREF(errorHandler);
4786 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004787 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004788 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004789}
4790
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Undecodable bytes are mapped to 0xDC00 + byte.  When wchar_t is not
   4 bytes wide, code points above 0xFFFF are stored as a surrogate
   pair.

   Return a pointer to a newly allocated, L'\0'-terminated wide
   character string (use PyMem_RawFree() to free the memory), or NULL
   on memory allocation error, including when size is too large for
   the buffer size to be computed. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    /* Same limit as "PY_SSIZE_T_MAX / sizeof(wchar_t) < size + 1", but
       written so that size + 1 is never computed before the check
       (signed overflow) and both sides stay signed. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            /* not expected to happen with a 4-byte output buffer */
            assert(0);
#else
            /* ch is a code point too large for one 16-bit unit, not a
               surrogate: it is the value about to be split into a
               surrogate pair. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            /* 0 with all input used up means we are done */
            if (!ch && s == e)
                break;
            /* surrogateescape: smuggle the offending byte through as
               0xDC00 + byte and resume after it */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004847/* Primary internal function which creates utf8 encoded bytes objects.
4848
4849 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004850 and allocate exactly as much space needed at the end. Else allocate the
4851 maximum possible needed (4 result bytes per Unicode character), and return
4852 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004853*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004854PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004855_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856{
Victor Stinner6099a032011-12-18 14:22:26 +01004857 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004858 void *data;
4859 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004861 if (!PyUnicode_Check(unicode)) {
4862 PyErr_BadArgument();
4863 return NULL;
4864 }
4865
4866 if (PyUnicode_READY(unicode) == -1)
4867 return NULL;
4868
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004869 if (PyUnicode_UTF8(unicode))
4870 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4871 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004872
4873 kind = PyUnicode_KIND(unicode);
4874 data = PyUnicode_DATA(unicode);
4875 size = PyUnicode_GET_LENGTH(unicode);
4876
Benjamin Petersonead6b532011-12-20 17:23:42 -06004877 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004878 default:
4879 assert(0);
4880 case PyUnicode_1BYTE_KIND:
4881 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4882 assert(!PyUnicode_IS_ASCII(unicode));
4883 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4884 case PyUnicode_2BYTE_KIND:
4885 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4886 case PyUnicode_4BYTE_KIND:
4887 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004888 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889}
4890
Alexander Belopolsky40018472011-02-26 01:02:56 +00004891PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004892PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4893 Py_ssize_t size,
4894 const char *errors)
4895{
4896 PyObject *v, *unicode;
4897
4898 unicode = PyUnicode_FromUnicode(s, size);
4899 if (unicode == NULL)
4900 return NULL;
4901 v = _PyUnicode_AsUTF8String(unicode, errors);
4902 Py_DECREF(unicode);
4903 return v;
4904}
4905
4906PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004907PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004909 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004910}
4911
Walter Dörwald41980ca2007-08-16 21:55:45 +00004912/* --- UTF-32 Codec ------------------------------------------------------- */
4913
4914PyObject *
4915PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004916 Py_ssize_t size,
4917 const char *errors,
4918 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004919{
4920 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4921}
4922
4923PyObject *
4924PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004925 Py_ssize_t size,
4926 const char *errors,
4927 int *byteorder,
4928 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004929{
4930 const char *starts = s;
4931 Py_ssize_t startinpos;
4932 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004933 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004934 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004935 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004936 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004937 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004938 PyObject *errorHandler = NULL;
4939 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004940
Walter Dörwald41980ca2007-08-16 21:55:45 +00004941 q = (unsigned char *)s;
4942 e = q + size;
4943
4944 if (byteorder)
4945 bo = *byteorder;
4946
4947 /* Check for BOM marks (U+FEFF) in the input and adjust current
4948 byte order setting accordingly. In native mode, the leading BOM
4949 mark is skipped, in all other modes, it is copied to the output
4950 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004951 if (bo == 0 && size >= 4) {
4952 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4953 if (bom == 0x0000FEFF) {
4954 bo = -1;
4955 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004956 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004957 else if (bom == 0xFFFE0000) {
4958 bo = 1;
4959 q += 4;
4960 }
4961 if (byteorder)
4962 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004963 }
4964
Victor Stinnere64322e2012-10-30 23:12:47 +01004965 if (q == e) {
4966 if (consumed)
4967 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004968 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004969 }
4970
Victor Stinnere64322e2012-10-30 23:12:47 +01004971#ifdef WORDS_BIGENDIAN
4972 le = bo < 0;
4973#else
4974 le = bo <= 0;
4975#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004976 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004977
Victor Stinner8f674cc2013-04-17 23:02:17 +02004978 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004979 writer.min_length = (e - q + 3) / 4;
4980 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004981 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004982
Victor Stinnere64322e2012-10-30 23:12:47 +01004983 while (1) {
4984 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004985 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004986
Victor Stinnere64322e2012-10-30 23:12:47 +01004987 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004988 enum PyUnicode_Kind kind = writer.kind;
4989 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004990 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004991 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004992 if (le) {
4993 do {
4994 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4995 if (ch > maxch)
4996 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004997 if (kind != PyUnicode_1BYTE_KIND &&
4998 Py_UNICODE_IS_SURROGATE(ch))
4999 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005000 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005001 q += 4;
5002 } while (q <= last);
5003 }
5004 else {
5005 do {
5006 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5007 if (ch > maxch)
5008 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005009 if (kind != PyUnicode_1BYTE_KIND &&
5010 Py_UNICODE_IS_SURROGATE(ch))
5011 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005012 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005013 q += 4;
5014 } while (q <= last);
5015 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005016 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005017 }
5018
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005019 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005020 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005021 startinpos = ((const char *)q) - starts;
5022 endinpos = startinpos + 4;
5023 }
5024 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005025 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005026 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005027 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005028 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005029 startinpos = ((const char *)q) - starts;
5030 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005031 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005032 else {
5033 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005034 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005035 goto onError;
5036 q += 4;
5037 continue;
5038 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005039 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005040 startinpos = ((const char *)q) - starts;
5041 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005043
5044 /* The remaining input chars are ignored if the callback
5045 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005046 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005047 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005048 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005049 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005050 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005051 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005052 }
5053
Walter Dörwald41980ca2007-08-16 21:55:45 +00005054 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005055 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005056
Walter Dörwald41980ca2007-08-16 21:55:45 +00005057 Py_XDECREF(errorHandler);
5058 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005059 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005060
Benjamin Peterson29060642009-01-31 22:14:21 +00005061 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005062 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005063 Py_XDECREF(errorHandler);
5064 Py_XDECREF(exc);
5065 return NULL;
5066}
5067
5068PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005069_PyUnicode_EncodeUTF32(PyObject *str,
5070 const char *errors,
5071 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005072{
Serhiy Storchaka30793282014-01-04 22:44:01 +02005073 int kind;
5074 void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005075 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005076 PyObject *v;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005077 unsigned char *p;
5078 Py_ssize_t nsize, i;
5079 /* Offsets from p for storing byte pairs in the right order. */
Christian Heimes743e0cd2012-10-17 23:52:17 +02005080#if PY_LITTLE_ENDIAN
Serhiy Storchaka30793282014-01-04 22:44:01 +02005081 int iorder[] = {0, 1, 2, 3};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005082#else
Serhiy Storchaka30793282014-01-04 22:44:01 +02005083 int iorder[] = {3, 2, 1, 0};
Walter Dörwald41980ca2007-08-16 21:55:45 +00005084#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005085 const char *encoding;
5086 PyObject *errorHandler = NULL;
5087 PyObject *exc = NULL;
5088 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005089
Serhiy Storchaka30793282014-01-04 22:44:01 +02005090#define STORECHAR(CH) \
5091 do { \
5092 p[iorder[3]] = ((CH) >> 24) & 0xff; \
5093 p[iorder[2]] = ((CH) >> 16) & 0xff; \
5094 p[iorder[1]] = ((CH) >> 8) & 0xff; \
5095 p[iorder[0]] = (CH) & 0xff; \
5096 p += 4; \
5097 } while(0)
5098
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005099 if (!PyUnicode_Check(str)) {
5100 PyErr_BadArgument();
5101 return NULL;
5102 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005103 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005104 return NULL;
5105 kind = PyUnicode_KIND(str);
5106 data = PyUnicode_DATA(str);
5107 len = PyUnicode_GET_LENGTH(str);
5108
Serhiy Storchaka583a9392014-01-04 19:25:37 +02005109 nsize = len + (byteorder == 0);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005110 if (nsize > PY_SSIZE_T_MAX / 4)
5111 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005112 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005113 if (v == NULL)
5114 return NULL;
5115
Serhiy Storchaka30793282014-01-04 22:44:01 +02005116 p = (unsigned char *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005117 if (byteorder == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005118 STORECHAR(0xFEFF);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005119 if (len == 0)
Serhiy Storchaka30793282014-01-04 22:44:01 +02005120 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005121
Serhiy Storchaka30793282014-01-04 22:44:01 +02005122 if (byteorder == -1) {
5123 /* force LE */
5124 iorder[0] = 0;
5125 iorder[1] = 1;
5126 iorder[2] = 2;
5127 iorder[3] = 3;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005128 encoding = "utf-32-le";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005129 }
5130 else if (byteorder == 1) {
5131 /* force BE */
5132 iorder[0] = 3;
5133 iorder[1] = 2;
5134 iorder[2] = 1;
5135 iorder[3] = 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005136 encoding = "utf-32-be";
Serhiy Storchaka30793282014-01-04 22:44:01 +02005137 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005138 else
5139 encoding = "utf-32";
5140
5141 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005142 for (i = 0; i < len; i++)
5143 STORECHAR(PyUnicode_READ(kind, data, i));
5144 return v;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005145 }
5146
Serhiy Storchaka30793282014-01-04 22:44:01 +02005147 for (i = 0; i < len;) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005148 Py_ssize_t repsize, moreunits;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005149 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5150 i++;
5151 assert(ch <= MAX_UNICODE);
5152 if (!Py_UNICODE_IS_SURROGATE(ch)) {
5153 STORECHAR(ch);
5154 continue;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005155 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005156
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005157 rep = unicode_encode_call_errorhandler(
5158 errors, &errorHandler,
5159 encoding, "surrogates not allowed",
Serhiy Storchaka30793282014-01-04 22:44:01 +02005160 str, &exc, i-1, i, &i);
5161
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005162 if (!rep)
5163 goto error;
5164
5165 if (PyBytes_Check(rep)) {
5166 repsize = PyBytes_GET_SIZE(rep);
5167 if (repsize & 3) {
5168 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005169 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005170 "surrogates not allowed");
5171 goto error;
5172 }
5173 moreunits = repsize / 4;
5174 }
5175 else {
5176 assert(PyUnicode_Check(rep));
5177 if (PyUnicode_READY(rep) < 0)
5178 goto error;
5179 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5180 if (!PyUnicode_IS_ASCII(rep)) {
5181 raise_encode_exception(&exc, encoding,
Serhiy Storchaka30793282014-01-04 22:44:01 +02005182 str, i - 1, i,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005183 "surrogates not allowed");
5184 goto error;
5185 }
5186 }
5187
5188 /* four bytes are reserved for each surrogate */
5189 if (moreunits > 1) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005190 Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005191 Py_ssize_t morebytes = 4 * (moreunits - 1);
5192 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5193 /* integer overflow */
5194 PyErr_NoMemory();
5195 goto error;
5196 }
5197 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5198 goto error;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005199 p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005200 }
5201
5202 if (PyBytes_Check(rep)) {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005203 Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5204 p += repsize;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005205 } else /* rep is unicode */ {
Serhiy Storchaka30793282014-01-04 22:44:01 +02005206 const Py_UCS1 *repdata;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005207 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka30793282014-01-04 22:44:01 +02005208 repdata = PyUnicode_1BYTE_DATA(rep);
5209 while (repsize--) {
5210 Py_UCS4 ch = *repdata++;
5211 STORECHAR(ch);
5212 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005213 }
5214
5215 Py_CLEAR(rep);
5216 }
5217
5218 /* Cut back to size actually needed. This is necessary for, for example,
5219 encoding of a string containing isolated surrogates and the 'ignore'
5220 handler is used. */
Serhiy Storchaka30793282014-01-04 22:44:01 +02005221 nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005222 if (nsize != PyBytes_GET_SIZE(v))
5223 _PyBytes_Resize(&v, nsize);
5224 Py_XDECREF(errorHandler);
5225 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005226 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005227 error:
5228 Py_XDECREF(rep);
5229 Py_XDECREF(errorHandler);
5230 Py_XDECREF(exc);
5231 Py_XDECREF(v);
5232 return NULL;
Serhiy Storchaka30793282014-01-04 22:44:01 +02005233#undef STORECHAR
Walter Dörwald41980ca2007-08-16 21:55:45 +00005234}
5235
Alexander Belopolsky40018472011-02-26 01:02:56 +00005236PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005237PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5238 Py_ssize_t size,
5239 const char *errors,
5240 int byteorder)
5241{
5242 PyObject *result;
5243 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5244 if (tmp == NULL)
5245 return NULL;
5246 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5247 Py_DECREF(tmp);
5248 return result;
5249}
5250
5251PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005252PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005253{
Victor Stinnerb960b342011-11-20 19:12:52 +01005254 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005255}
5256
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257/* --- UTF-16 Codec ------------------------------------------------------- */
5258
Tim Peters772747b2001-08-09 22:21:55 +00005259PyObject *
5260PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005261 Py_ssize_t size,
5262 const char *errors,
5263 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264{
Walter Dörwald69652032004-09-07 20:24:22 +00005265 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5266}
5267
5268PyObject *
5269PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005270 Py_ssize_t size,
5271 const char *errors,
5272 int *byteorder,
5273 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005274{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005275 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005276 Py_ssize_t startinpos;
5277 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005278 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005279 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005280 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005281 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005282 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005283 PyObject *errorHandler = NULL;
5284 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005285 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286
Tim Peters772747b2001-08-09 22:21:55 +00005287 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005288 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289
5290 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005291 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005293 /* Check for BOM marks (U+FEFF) in the input and adjust current
5294 byte order setting accordingly. In native mode, the leading BOM
5295 mark is skipped, in all other modes, it is copied to the output
5296 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005297 if (bo == 0 && size >= 2) {
5298 const Py_UCS4 bom = (q[1] << 8) | q[0];
5299 if (bom == 0xFEFF) {
5300 q += 2;
5301 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005302 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005303 else if (bom == 0xFFFE) {
5304 q += 2;
5305 bo = 1;
5306 }
5307 if (byteorder)
5308 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005309 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310
Antoine Pitrou63065d72012-05-15 23:48:04 +02005311 if (q == e) {
5312 if (consumed)
5313 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005314 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005315 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005316
Christian Heimes743e0cd2012-10-17 23:52:17 +02005317#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005318 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005319 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005320#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005321 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005322 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005323#endif
Tim Peters772747b2001-08-09 22:21:55 +00005324
Antoine Pitrou63065d72012-05-15 23:48:04 +02005325 /* Note: size will always be longer than the resulting Unicode
5326 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005327 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005328 writer.min_length = (e - q + 1) / 2;
5329 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005330 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005331
Antoine Pitrou63065d72012-05-15 23:48:04 +02005332 while (1) {
5333 Py_UCS4 ch = 0;
5334 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005335 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005336 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005337 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005338 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005339 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005340 native_ordering);
5341 else
5342 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005343 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005344 native_ordering);
5345 } else if (kind == PyUnicode_2BYTE_KIND) {
5346 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005347 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005348 native_ordering);
5349 } else {
5350 assert(kind == PyUnicode_4BYTE_KIND);
5351 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005352 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005353 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005354 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005355 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005356
Antoine Pitrou63065d72012-05-15 23:48:04 +02005357 switch (ch)
5358 {
5359 case 0:
5360 /* remaining byte at the end? (size should be even) */
5361 if (q == e || consumed)
5362 goto End;
5363 errmsg = "truncated data";
5364 startinpos = ((const char *)q) - starts;
5365 endinpos = ((const char *)e) - starts;
5366 break;
5367 /* The remaining input chars are ignored if the callback
5368 chooses to skip the input */
5369 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005370 q -= 2;
5371 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005372 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005373 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005374 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005375 endinpos = ((const char *)e) - starts;
5376 break;
5377 case 2:
5378 errmsg = "illegal encoding";
5379 startinpos = ((const char *)q) - 2 - starts;
5380 endinpos = startinpos + 2;
5381 break;
5382 case 3:
5383 errmsg = "illegal UTF-16 surrogate";
5384 startinpos = ((const char *)q) - 4 - starts;
5385 endinpos = startinpos + 2;
5386 break;
5387 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005388 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005389 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005390 continue;
5391 }
5392
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005393 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005394 errors,
5395 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005396 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005397 &starts,
5398 (const char **)&e,
5399 &startinpos,
5400 &endinpos,
5401 &exc,
5402 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005403 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005404 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 }
5406
Antoine Pitrou63065d72012-05-15 23:48:04 +02005407End:
Walter Dörwald69652032004-09-07 20:24:22 +00005408 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005409 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005410
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005411 Py_XDECREF(errorHandler);
5412 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005413 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005414
Benjamin Peterson29060642009-01-31 22:14:21 +00005415 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005416 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005417 Py_XDECREF(errorHandler);
5418 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 return NULL;
5420}
5421
Tim Peters772747b2001-08-09 22:21:55 +00005422PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005423_PyUnicode_EncodeUTF16(PyObject *str,
5424 const char *errors,
5425 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005427 enum PyUnicode_Kind kind;
5428 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005429 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005430 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005431 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005432 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005433#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005434 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005435#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005436 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005437#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005438 const char *encoding;
5439 Py_ssize_t nsize, pos;
5440 PyObject *errorHandler = NULL;
5441 PyObject *exc = NULL;
5442 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005443
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005444 if (!PyUnicode_Check(str)) {
5445 PyErr_BadArgument();
5446 return NULL;
5447 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005448 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005449 return NULL;
5450 kind = PyUnicode_KIND(str);
5451 data = PyUnicode_DATA(str);
5452 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005453
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005454 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005455 if (kind == PyUnicode_4BYTE_KIND) {
5456 const Py_UCS4 *in = (const Py_UCS4 *)data;
5457 const Py_UCS4 *end = in + len;
5458 while (in < end)
5459 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005460 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005461 }
5462 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005463 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005464 nsize = len + pairs + (byteorder == 0);
5465 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466 if (v == NULL)
5467 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005468
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005469 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005470 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005471 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005473 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005474 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005475 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005476
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005477 if (kind == PyUnicode_1BYTE_KIND) {
5478 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5479 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005480 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005481
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005482 if (byteorder < 0)
5483 encoding = "utf-16-le";
5484 else if (byteorder > 0)
5485 encoding = "utf-16-be";
5486 else
5487 encoding = "utf-16";
5488
5489 pos = 0;
5490 while (pos < len) {
5491 Py_ssize_t repsize, moreunits;
5492
5493 if (kind == PyUnicode_2BYTE_KIND) {
5494 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5495 &out, native_ordering);
5496 }
5497 else {
5498 assert(kind == PyUnicode_4BYTE_KIND);
5499 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5500 &out, native_ordering);
5501 }
5502 if (pos == len)
5503 break;
5504
5505 rep = unicode_encode_call_errorhandler(
5506 errors, &errorHandler,
5507 encoding, "surrogates not allowed",
5508 str, &exc, pos, pos + 1, &pos);
5509 if (!rep)
5510 goto error;
5511
5512 if (PyBytes_Check(rep)) {
5513 repsize = PyBytes_GET_SIZE(rep);
5514 if (repsize & 1) {
5515 raise_encode_exception(&exc, encoding,
5516 str, pos - 1, pos,
5517 "surrogates not allowed");
5518 goto error;
5519 }
5520 moreunits = repsize / 2;
5521 }
5522 else {
5523 assert(PyUnicode_Check(rep));
5524 if (PyUnicode_READY(rep) < 0)
5525 goto error;
5526 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5527 if (!PyUnicode_IS_ASCII(rep)) {
5528 raise_encode_exception(&exc, encoding,
5529 str, pos - 1, pos,
5530 "surrogates not allowed");
5531 goto error;
5532 }
5533 }
5534
5535 /* two bytes are reserved for each surrogate */
5536 if (moreunits > 1) {
5537 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5538 Py_ssize_t morebytes = 2 * (moreunits - 1);
5539 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5540 /* integer overflow */
5541 PyErr_NoMemory();
5542 goto error;
5543 }
5544 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5545 goto error;
5546 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5547 }
5548
5549 if (PyBytes_Check(rep)) {
5550 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5551 out += moreunits;
5552 } else /* rep is unicode */ {
5553 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5554 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5555 &out, native_ordering);
5556 }
5557
5558 Py_CLEAR(rep);
5559 }
5560
5561 /* Cut back to size actually needed. This is necessary for, for example,
5562 encoding of a string containing isolated surrogates and the 'ignore' handler
5563 is used. */
5564 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5565 if (nsize != PyBytes_GET_SIZE(v))
5566 _PyBytes_Resize(&v, nsize);
5567 Py_XDECREF(errorHandler);
5568 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005569 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005570 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005571 error:
5572 Py_XDECREF(rep);
5573 Py_XDECREF(errorHandler);
5574 Py_XDECREF(exc);
5575 Py_XDECREF(v);
5576 return NULL;
5577#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578}
5579
Alexander Belopolsky40018472011-02-26 01:02:56 +00005580PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005581PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5582 Py_ssize_t size,
5583 const char *errors,
5584 int byteorder)
5585{
5586 PyObject *result;
5587 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5588 if (tmp == NULL)
5589 return NULL;
5590 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5591 Py_DECREF(tmp);
5592 return result;
5593}
5594
5595PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005596PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005598 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599}
5600
5601/* --- Unicode Escape Codec ----------------------------------------------- */
5602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005603/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5604 if all the escapes in the string make it still a valid ASCII string.
5605 Returns -1 if any escapes were found which cause the string to
5606 pop out of ASCII range. Otherwise returns the length of the
5607 required buffer to hold the string.
5608 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005609static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005610length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5611{
5612 const unsigned char *p = (const unsigned char *)s;
5613 const unsigned char *end = p + size;
5614 Py_ssize_t length = 0;
5615
5616 if (size < 0)
5617 return -1;
5618
5619 for (; p < end; ++p) {
5620 if (*p > 127) {
5621 /* Non-ASCII */
5622 return -1;
5623 }
5624 else if (*p != '\\') {
5625 /* Normal character */
5626 ++length;
5627 }
5628 else {
5629 /* Backslash-escape, check next char */
5630 ++p;
5631 /* Escape sequence reaches till end of string or
5632 non-ASCII follow-up. */
5633 if (p >= end || *p > 127)
5634 return -1;
5635 switch (*p) {
5636 case '\n':
5637 /* backslash + \n result in zero characters */
5638 break;
5639 case '\\': case '\'': case '\"':
5640 case 'b': case 'f': case 't':
5641 case 'n': case 'r': case 'v': case 'a':
5642 ++length;
5643 break;
5644 case '0': case '1': case '2': case '3':
5645 case '4': case '5': case '6': case '7':
5646 case 'x': case 'u': case 'U': case 'N':
5647 /* these do not guarantee ASCII characters */
5648 return -1;
5649 default:
5650 /* count the backslash + the other character */
5651 length += 2;
5652 }
5653 }
5654 }
5655 return length;
5656}
5657
Fredrik Lundh06d12682001-01-24 07:59:11 +00005658static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005659
Alexander Belopolsky40018472011-02-26 01:02:56 +00005660PyObject *
5661PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005662 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005663 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005664{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005665 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005666 Py_ssize_t startinpos;
5667 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005668 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005670 char* message;
5671 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005672 PyObject *errorHandler = NULL;
5673 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005674 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005675
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005676 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005677 if (len == 0)
5678 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005679
5680 /* After length_of_escaped_ascii_string() there are two alternatives,
5681 either the string is pure ASCII with named escapes like \n, etc.
5682 and we determined it's exact size (common case)
5683 or it contains \x, \u, ... escape sequences. then we create a
5684 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005685 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005686 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005687 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005688 }
5689 else {
5690 /* Escaped strings will always be longer than the resulting
5691 Unicode string, so we start with size here and then reduce the
5692 length after conversion to the true value.
5693 (but if the error callback returns a long replacement string
5694 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005695 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005696 }
5697
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005699 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005701
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702 while (s < end) {
5703 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005704 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005705 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706
5707 /* Non-escape characters are interpreted as Unicode ordinals */
5708 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005709 x = (unsigned char)*s;
5710 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005711 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005712 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713 continue;
5714 }
5715
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005716 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 /* \ - Escapes */
5718 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005719 c = *s++;
5720 if (s > end)
5721 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005722
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005723 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724
Benjamin Peterson29060642009-01-31 22:14:21 +00005725 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005726#define WRITECHAR(ch) \
5727 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005728 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005729 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005730 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005731
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005733 case '\\': WRITECHAR('\\'); break;
5734 case '\'': WRITECHAR('\''); break;
5735 case '\"': WRITECHAR('\"'); break;
5736 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005737 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005738 case 'f': WRITECHAR('\014'); break;
5739 case 't': WRITECHAR('\t'); break;
5740 case 'n': WRITECHAR('\n'); break;
5741 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005742 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005743 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005744 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005745 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746
Benjamin Peterson29060642009-01-31 22:14:21 +00005747 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 case '0': case '1': case '2': case '3':
5749 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005750 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005751 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005752 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005753 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005754 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005756 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 break;
5758
Benjamin Peterson29060642009-01-31 22:14:21 +00005759 /* hex escapes */
5760 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005762 digits = 2;
5763 message = "truncated \\xXX escape";
5764 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765
Benjamin Peterson29060642009-01-31 22:14:21 +00005766 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005768 digits = 4;
5769 message = "truncated \\uXXXX escape";
5770 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771
Benjamin Peterson29060642009-01-31 22:14:21 +00005772 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005773 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005774 digits = 8;
5775 message = "truncated \\UXXXXXXXX escape";
5776 hexescape:
5777 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005778 if (end - s < digits) {
5779 /* count only hex digits */
5780 for (; s < end; ++s) {
5781 c = (unsigned char)*s;
5782 if (!Py_ISXDIGIT(c))
5783 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005784 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005785 goto error;
5786 }
5787 for (; digits--; ++s) {
5788 c = (unsigned char)*s;
5789 if (!Py_ISXDIGIT(c))
5790 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005791 chr = (chr<<4) & ~0xF;
5792 if (c >= '0' && c <= '9')
5793 chr += c - '0';
5794 else if (c >= 'a' && c <= 'f')
5795 chr += 10 + c - 'a';
5796 else
5797 chr += 10 + c - 'A';
5798 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005799 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005800 /* _decoding_error will have already written into the
5801 target buffer. */
5802 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005803 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005804 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005805 message = "illegal Unicode character";
5806 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005807 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005808 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005809 break;
5810
Benjamin Peterson29060642009-01-31 22:14:21 +00005811 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005812 case 'N':
5813 message = "malformed \\N character escape";
5814 if (ucnhash_CAPI == NULL) {
5815 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005816 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5817 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005818 if (ucnhash_CAPI == NULL)
5819 goto ucnhashError;
5820 }
5821 if (*s == '{') {
5822 const char *start = s+1;
5823 /* look for the closing brace */
5824 while (*s != '}' && s < end)
5825 s++;
5826 if (s > start && s < end && *s == '}') {
5827 /* found a name. look it up in the unicode database */
5828 message = "unknown Unicode character name";
5829 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005830 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005831 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005832 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005833 goto store;
5834 }
5835 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005836 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005837
5838 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005839 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005840 message = "\\ at end of string";
5841 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005842 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005843 }
5844 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005845 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005846 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005847 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005848 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005850 continue;
5851
5852 error:
5853 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005854 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005855 errors, &errorHandler,
5856 "unicodeescape", message,
5857 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005858 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005859 goto onError;
5860 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005862#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005863
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005864 Py_XDECREF(errorHandler);
5865 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005866 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005867
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005869 PyErr_SetString(
5870 PyExc_UnicodeError,
5871 "\\N escapes not supported (can't load unicodedata module)"
5872 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005873 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005874 Py_XDECREF(errorHandler);
5875 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005876 return NULL;
5877
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005879 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005880 Py_XDECREF(errorHandler);
5881 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005882 return NULL;
5883}
5884
5885/* Return a Unicode-Escape string version of the Unicode object.
5886
5887 If quotes is true, the string is enclosed in u"" or u'' quotes as
5888 appropriate.
5889
5890*/
5891
Alexander Belopolsky40018472011-02-26 01:02:56 +00005892PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005893PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005894{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005895 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005896 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005898 int kind;
5899 void *data;
5900 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005901
Ezio Melottie7f90372012-10-05 03:33:31 +03005902 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005903 escape.
5904
Ezio Melottie7f90372012-10-05 03:33:31 +03005905 For UCS1 strings it's '\xxx', 4 bytes per source character.
5906 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5907 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005908 */
5909
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005910 if (!PyUnicode_Check(unicode)) {
5911 PyErr_BadArgument();
5912 return NULL;
5913 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005914 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005915 return NULL;
5916 len = PyUnicode_GET_LENGTH(unicode);
5917 kind = PyUnicode_KIND(unicode);
5918 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005919 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005920 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5921 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5922 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5923 }
5924
5925 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005926 return PyBytes_FromStringAndSize(NULL, 0);
5927
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005928 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005929 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005930
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005931 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005933 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005934 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 if (repr == NULL)
5936 return NULL;
5937
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005938 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005939
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005940 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005941 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005942
Walter Dörwald79e913e2007-05-12 11:08:06 +00005943 /* Escape backslashes */
5944 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005945 *p++ = '\\';
5946 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005947 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005948 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005949
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005950 /* Map 21-bit characters to '\U00xxxxxx' */
5951 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005952 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005953 *p++ = '\\';
5954 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005955 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5956 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5957 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5958 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5959 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5960 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5961 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5962 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005963 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005964 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005965
Guido van Rossumd57fd912000-03-10 22:53:23 +00005966 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005967 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005968 *p++ = '\\';
5969 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005970 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5971 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5972 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5973 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005975
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005976 /* Map special whitespace to '\t', \n', '\r' */
5977 else if (ch == '\t') {
5978 *p++ = '\\';
5979 *p++ = 't';
5980 }
5981 else if (ch == '\n') {
5982 *p++ = '\\';
5983 *p++ = 'n';
5984 }
5985 else if (ch == '\r') {
5986 *p++ = '\\';
5987 *p++ = 'r';
5988 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005989
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005990 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005991 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005992 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005993 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005994 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5995 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005996 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005997
Guido van Rossumd57fd912000-03-10 22:53:23 +00005998 /* Copy everything else as-is */
5999 else
6000 *p++ = (char) ch;
6001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006003 assert(p - PyBytes_AS_STRING(repr) > 0);
6004 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6005 return NULL;
6006 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007}
6008
Alexander Belopolsky40018472011-02-26 01:02:56 +00006009PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006010PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6011 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006012{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006013 PyObject *result;
6014 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6015 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006016 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006017 result = PyUnicode_AsUnicodeEscapeString(tmp);
6018 Py_DECREF(tmp);
6019 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020}
6021
6022/* --- Raw Unicode Escape Codec ------------------------------------------- */
6023
Alexander Belopolsky40018472011-02-26 01:02:56 +00006024PyObject *
6025PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006026 Py_ssize_t size,
6027 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006029 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006030 Py_ssize_t startinpos;
6031 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006032 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006033 const char *end;
6034 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006035 PyObject *errorHandler = NULL;
6036 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006037
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006038 if (size == 0)
6039 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006040
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 /* Escaped strings will always be longer than the resulting
6042 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006043 length after conversion to the true value. (But decoding error
6044 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006045 _PyUnicodeWriter_Init(&writer);
6046 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006047
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 end = s + size;
6049 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 unsigned char c;
6051 Py_UCS4 x;
6052 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006053 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054
Benjamin Peterson29060642009-01-31 22:14:21 +00006055 /* Non-escape characters are interpreted as Unicode ordinals */
6056 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006057 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006058 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006059 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006061 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006062 startinpos = s-starts;
6063
6064 /* \u-escapes are only interpreted iff the number of leading
6065 backslashes if odd */
6066 bs = s;
6067 for (;s < end;) {
6068 if (*s != '\\')
6069 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006070 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006071 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006072 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006073 }
6074 if (((s - bs) & 1) == 0 ||
6075 s >= end ||
6076 (*s != 'u' && *s != 'U')) {
6077 continue;
6078 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006079 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 count = *s=='u' ? 4 : 8;
6081 s++;
6082
6083 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 for (x = 0, i = 0; i < count; ++i, ++s) {
6085 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006086 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006088 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006089 errors, &errorHandler,
6090 "rawunicodeescape", "truncated \\uXXXX",
6091 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006092 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006093 goto onError;
6094 goto nextByte;
6095 }
6096 x = (x<<4) & ~0xF;
6097 if (c >= '0' && c <= '9')
6098 x += c - '0';
6099 else if (c >= 'a' && c <= 'f')
6100 x += 10 + c - 'a';
6101 else
6102 x += 10 + c - 'A';
6103 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006104 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006105 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006106 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006107 }
6108 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006109 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006110 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006111 errors, &errorHandler,
6112 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006113 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006114 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006115 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006116 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006117 nextByte:
6118 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006119 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006120 Py_XDECREF(errorHandler);
6121 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006122 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006123
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006125 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006126 Py_XDECREF(errorHandler);
6127 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128 return NULL;
6129}
6130
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006131
Alexander Belopolsky40018472011-02-26 01:02:56 +00006132PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006133PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006135 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006136 char *p;
6137 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006138 Py_ssize_t expandsize, pos;
6139 int kind;
6140 void *data;
6141 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006143 if (!PyUnicode_Check(unicode)) {
6144 PyErr_BadArgument();
6145 return NULL;
6146 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006147 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006148 return NULL;
6149 kind = PyUnicode_KIND(unicode);
6150 data = PyUnicode_DATA(unicode);
6151 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006152 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6153 bytes, and 1 byte characters 4. */
6154 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006155
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006158
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 if (repr == NULL)
6161 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006162 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006163 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006165 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006166 for (pos = 0; pos < len; pos++) {
6167 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006168 /* Map 32-bit characters to '\Uxxxxxxxx' */
6169 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006170 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006171 *p++ = '\\';
6172 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006173 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6174 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6175 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6176 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6177 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6178 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6179 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6180 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006181 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006182 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006183 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184 *p++ = '\\';
6185 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006186 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6187 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6188 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6189 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006191 /* Copy everything else as-is */
6192 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193 *p++ = (char) ch;
6194 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006195
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006196 assert(p > q);
6197 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006198 return NULL;
6199 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200}
6201
Alexander Belopolsky40018472011-02-26 01:02:56 +00006202PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006203PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6204 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006205{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006206 PyObject *result;
6207 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6208 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006209 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006210 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6211 Py_DECREF(tmp);
6212 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213}
6214
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006215/* --- Unicode Internal Codec ------------------------------------------- */
6216
Alexander Belopolsky40018472011-02-26 01:02:56 +00006217PyObject *
6218_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006219 Py_ssize_t size,
6220 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006221{
6222 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006223 Py_ssize_t startinpos;
6224 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006225 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006226 const char *end;
6227 const char *reason;
6228 PyObject *errorHandler = NULL;
6229 PyObject *exc = NULL;
6230
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006231 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006232 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006233 1))
6234 return NULL;
6235
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006236 if (size == 0)
6237 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006238
Victor Stinner8f674cc2013-04-17 23:02:17 +02006239 _PyUnicodeWriter_Init(&writer);
6240 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6241 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006243 }
6244 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006245
Victor Stinner8f674cc2013-04-17 23:02:17 +02006246 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006247 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006248 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006249 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006250 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006251 endinpos = end-starts;
6252 reason = "truncated input";
6253 goto error;
6254 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006255 /* We copy the raw representation one byte at a time because the
6256 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006257 ((char *) &uch)[0] = s[0];
6258 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006259#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006260 ((char *) &uch)[2] = s[2];
6261 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006262#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006263 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006264#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006265 /* We have to sanity check the raw data, otherwise doom looms for
6266 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006267 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006268 endinpos = s - starts + Py_UNICODE_SIZE;
6269 reason = "illegal code point (> 0x10FFFF)";
6270 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006271 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006272#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006273 s += Py_UNICODE_SIZE;
6274#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006275 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006276 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006277 Py_UNICODE uch2;
6278 ((char *) &uch2)[0] = s[0];
6279 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006280 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006281 {
Victor Stinner551ac952011-11-29 22:58:13 +01006282 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006283 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006284 }
6285 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006286#endif
6287
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006288 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006289 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006290 continue;
6291
6292 error:
6293 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006294 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006295 errors, &errorHandler,
6296 "unicode_internal", reason,
6297 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006298 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006299 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006300 }
6301
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006302 Py_XDECREF(errorHandler);
6303 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006304 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006305
Benjamin Peterson29060642009-01-31 22:14:21 +00006306 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006307 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006308 Py_XDECREF(errorHandler);
6309 Py_XDECREF(exc);
6310 return NULL;
6311}
6312
/* --- Latin-1 Codec ------------------------------------------------------ */
6314
Alexander Belopolsky40018472011-02-26 01:02:56 +00006315PyObject *
6316PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006317 Py_ssize_t size,
6318 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006321 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006322}
6323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006324/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006325static void
6326make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006327 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006328 PyObject *unicode,
6329 Py_ssize_t startpos, Py_ssize_t endpos,
6330 const char *reason)
6331{
6332 if (*exceptionObject == NULL) {
6333 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006334 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006335 encoding, unicode, startpos, endpos, reason);
6336 }
6337 else {
6338 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6339 goto onError;
6340 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6341 goto onError;
6342 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6343 goto onError;
6344 return;
6345 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006346 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006347 }
6348}
6349
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006350/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006351static void
6352raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006353 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006354 PyObject *unicode,
6355 Py_ssize_t startpos, Py_ssize_t endpos,
6356 const char *reason)
6357{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006358 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006359 encoding, unicode, startpos, endpos, reason);
6360 if (*exceptionObject != NULL)
6361 PyCodec_StrictErrors(*exceptionObject);
6362}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006363
6364/* error handling callback helper:
6365 build arguments, call the callback and check the arguments,
6366 put the result into newpos and return the replacement string, which
6367 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006368static PyObject *
6369unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006370 PyObject **errorHandler,
6371 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006372 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006373 Py_ssize_t startpos, Py_ssize_t endpos,
6374 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006375{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006376 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006377 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006378 PyObject *restuple;
6379 PyObject *resunicode;
6380
6381 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006383 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006384 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006385 }
6386
Benjamin Petersonbac79492012-01-14 13:34:47 -05006387 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006388 return NULL;
6389 len = PyUnicode_GET_LENGTH(unicode);
6390
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006391 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006392 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006393 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006394 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006395
6396 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006397 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006399 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006400 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006401 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006402 Py_DECREF(restuple);
6403 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006404 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006405 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006406 &resunicode, newpos)) {
6407 Py_DECREF(restuple);
6408 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006409 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006410 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6411 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6412 Py_DECREF(restuple);
6413 return NULL;
6414 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006415 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006416 *newpos = len + *newpos;
6417 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006418 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006419 Py_DECREF(restuple);
6420 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006421 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 Py_INCREF(resunicode);
6423 Py_DECREF(restuple);
6424 return resunicode;
6425}
6426
Alexander Belopolsky40018472011-02-26 01:02:56 +00006427static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006428unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006429 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006430 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006431{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006432 /* input state */
6433 Py_ssize_t pos=0, size;
6434 int kind;
6435 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006436 /* output object */
6437 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006438 /* pointer into the output */
6439 char *str;
6440 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006441 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006442 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6443 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006444 PyObject *errorHandler = NULL;
6445 PyObject *exc = NULL;
6446 /* the following variable is used for caching string comparisons
6447 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6448 int known_errorHandler = -1;
6449
Benjamin Petersonbac79492012-01-14 13:34:47 -05006450 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006451 return NULL;
6452 size = PyUnicode_GET_LENGTH(unicode);
6453 kind = PyUnicode_KIND(unicode);
6454 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 /* allocate enough for a simple encoding without
6456 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006457 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006458 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006459 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006460 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006461 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006462 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463 ressize = size;
6464
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006465 while (pos < size) {
6466 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006467
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 /* can we encode this? */
6469 if (c<limit) {
6470 /* no overflow check, because we know that the space is enough */
6471 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006472 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006473 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006474 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006475 Py_ssize_t requiredsize;
6476 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006477 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006478 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006479 Py_ssize_t collstart = pos;
6480 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006482 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 ++collend;
6484 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6485 if (known_errorHandler==-1) {
6486 if ((errors==NULL) || (!strcmp(errors, "strict")))
6487 known_errorHandler = 1;
6488 else if (!strcmp(errors, "replace"))
6489 known_errorHandler = 2;
6490 else if (!strcmp(errors, "ignore"))
6491 known_errorHandler = 3;
6492 else if (!strcmp(errors, "xmlcharrefreplace"))
6493 known_errorHandler = 4;
6494 else
6495 known_errorHandler = 0;
6496 }
6497 switch (known_errorHandler) {
6498 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006499 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 goto onError;
6501 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006502 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006503 *str++ = '?'; /* fall through */
6504 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006505 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 break;
6507 case 4: /* xmlcharrefreplace */
6508 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006509 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006510 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006511 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006512 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006513 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006514 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006515 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006517 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006518 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006519 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006520 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006521 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006522 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006523 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006524 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006525 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006526 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006527 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006528 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006529 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006530 if (requiredsize > PY_SSIZE_T_MAX - incr)
6531 goto overflow;
6532 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006533 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006534 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6535 goto overflow;
6536 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006537 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006538 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006539 requiredsize = 2*ressize;
6540 if (_PyBytes_Resize(&res, requiredsize))
6541 goto onError;
6542 str = PyBytes_AS_STRING(res) + respos;
6543 ressize = requiredsize;
6544 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006545 /* generate replacement */
6546 for (i = collstart; i < collend; ++i) {
6547 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006549 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006550 break;
6551 default:
6552 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006553 encoding, reason, unicode, &exc,
6554 collstart, collend, &newpos);
6555 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006556 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006557 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006558 if (PyBytes_Check(repunicode)) {
6559 /* Directly copy bytes result to output. */
6560 repsize = PyBytes_Size(repunicode);
6561 if (repsize > 1) {
6562 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006563 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006564 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6565 Py_DECREF(repunicode);
6566 goto overflow;
6567 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006568 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6569 Py_DECREF(repunicode);
6570 goto onError;
6571 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006572 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006573 ressize += repsize-1;
6574 }
6575 memcpy(str, PyBytes_AsString(repunicode), repsize);
6576 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006577 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006578 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006579 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006580 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006581 /* need more space? (at least enough for what we
6582 have+the replacement+the rest of the string, so
6583 we won't have to check space for encodable characters) */
6584 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006585 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006586 requiredsize = respos;
6587 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6588 goto overflow;
6589 requiredsize += repsize;
6590 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6591 goto overflow;
6592 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006593 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006594 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006595 requiredsize = 2*ressize;
6596 if (_PyBytes_Resize(&res, requiredsize)) {
6597 Py_DECREF(repunicode);
6598 goto onError;
6599 }
6600 str = PyBytes_AS_STRING(res) + respos;
6601 ressize = requiredsize;
6602 }
6603 /* check if there is anything unencodable in the replacement
6604 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006605 for (i = 0; repsize-->0; ++i, ++str) {
6606 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006607 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006608 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006609 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006610 Py_DECREF(repunicode);
6611 goto onError;
6612 }
6613 *str = (char)c;
6614 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006615 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006616 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006617 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006618 }
6619 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006620 /* Resize if we allocated to much */
6621 size = str - PyBytes_AS_STRING(res);
6622 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006623 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006624 if (_PyBytes_Resize(&res, size) < 0)
6625 goto onError;
6626 }
6627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006628 Py_XDECREF(errorHandler);
6629 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006630 return res;
6631
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006632 overflow:
6633 PyErr_SetString(PyExc_OverflowError,
6634 "encoded result is too long for a Python string");
6635
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006636 onError:
6637 Py_XDECREF(res);
6638 Py_XDECREF(errorHandler);
6639 Py_XDECREF(exc);
6640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006641}
6642
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006643/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006644PyObject *
6645PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006646 Py_ssize_t size,
6647 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006649 PyObject *result;
6650 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6651 if (unicode == NULL)
6652 return NULL;
6653 result = unicode_encode_ucs1(unicode, errors, 256);
6654 Py_DECREF(unicode);
6655 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656}
6657
Alexander Belopolsky40018472011-02-26 01:02:56 +00006658PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006659_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660{
6661 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006662 PyErr_BadArgument();
6663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006665 if (PyUnicode_READY(unicode) == -1)
6666 return NULL;
6667 /* Fast path: if it is a one-byte string, construct
6668 bytes object directly. */
6669 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6670 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6671 PyUnicode_GET_LENGTH(unicode));
6672 /* Non-Latin-1 characters present. Defer to above function to
6673 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006674 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006675}
6676
6677PyObject*
6678PyUnicode_AsLatin1String(PyObject *unicode)
6679{
6680 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006681}
6682
/* --- 7-bit ASCII Codec -------------------------------------------------- */
6684
Alexander Belopolsky40018472011-02-26 01:02:56 +00006685PyObject *
6686PyUnicode_DecodeASCII(const char *s,
6687 Py_ssize_t size,
6688 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006689{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006690 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006691 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006692 int kind;
6693 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006694 Py_ssize_t startinpos;
6695 Py_ssize_t endinpos;
6696 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006697 const char *e;
6698 PyObject *errorHandler = NULL;
6699 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006700
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006702 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006703
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006705 if (size == 1 && (unsigned char)s[0] < 128)
6706 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006707
Victor Stinner8f674cc2013-04-17 23:02:17 +02006708 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006709 writer.min_length = size;
6710 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006711 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006712
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006713 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006714 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006715 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006716 writer.pos = outpos;
6717 if (writer.pos == size)
6718 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006719
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006720 s += writer.pos;
6721 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006723 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006725 PyUnicode_WRITE(kind, data, writer.pos, c);
6726 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006727 ++s;
6728 }
6729 else {
6730 startinpos = s-starts;
6731 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006732 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006733 errors, &errorHandler,
6734 "ascii", "ordinal not in range(128)",
6735 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006736 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006737 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006738 kind = writer.kind;
6739 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006740 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 Py_XDECREF(errorHandler);
6743 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006744 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006745
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006747 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748 Py_XDECREF(errorHandler);
6749 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750 return NULL;
6751}
6752
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006753/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006754PyObject *
6755PyUnicode_EncodeASCII(const Py_UNICODE *p,
6756 Py_ssize_t size,
6757 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006759 PyObject *result;
6760 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6761 if (unicode == NULL)
6762 return NULL;
6763 result = unicode_encode_ucs1(unicode, errors, 128);
6764 Py_DECREF(unicode);
6765 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766}
6767
Alexander Belopolsky40018472011-02-26 01:02:56 +00006768PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006769_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770{
6771 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006772 PyErr_BadArgument();
6773 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006775 if (PyUnicode_READY(unicode) == -1)
6776 return NULL;
6777 /* Fast path: if it is an ASCII-only string, construct bytes object
6778 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006779 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006780 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6781 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006782 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006783}
6784
6785PyObject *
6786PyUnicode_AsASCIIString(PyObject *unicode)
6787{
6788 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789}
6790
Victor Stinner99b95382011-07-04 14:23:54 +02006791#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006792
/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006794
/* NEED_RETRY is defined when int is narrower than size_t.
   NOTE(review): presumably it enables chunked conversion of inputs whose
   length does not fit in an int -- confirm against its users. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif

/* Fallback definition for headers that do not provide the flag. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6802
6803static char*
6804code_page_name(UINT code_page, PyObject **obj)
6805{
6806 *obj = NULL;
6807 if (code_page == CP_ACP)
6808 return "mbcs";
6809 if (code_page == CP_UTF7)
6810 return "CP_UTF7";
6811 if (code_page == CP_UTF8)
6812 return "CP_UTF8";
6813
6814 *obj = PyBytes_FromFormat("cp%u", code_page);
6815 if (*obj == NULL)
6816 return NULL;
6817 return PyBytes_AS_STRING(*obj);
6818}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006819
Victor Stinner3a50e702011-10-18 21:21:00 +02006820static DWORD
6821decode_code_page_flags(UINT code_page)
6822{
6823 if (code_page == CP_UTF7) {
6824 /* The CP_UTF7 decoder only supports flags=0 */
6825 return 0;
6826 }
6827 else
6828 return MB_ERR_INVALID_CHARS;
6829}
6830
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006831/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006832 * Decode a byte string from a Windows code page into unicode object in strict
6833 * mode.
6834 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006835 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6836 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006837 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006838static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006839decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006840 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006841 const char *in,
6842 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006843{
Victor Stinner3a50e702011-10-18 21:21:00 +02006844 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006845 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006846 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006847
6848 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006849 assert(insize > 0);
6850 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6851 if (outsize <= 0)
6852 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006853
6854 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006856 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006857 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006858 if (*v == NULL)
6859 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006860 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006861 }
6862 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006864 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006865 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006867 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006868 }
6869
6870 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006871 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6872 if (outsize <= 0)
6873 goto error;
6874 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006875
Victor Stinner3a50e702011-10-18 21:21:00 +02006876error:
6877 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6878 return -2;
6879 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006880 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006881}
6882
Victor Stinner3a50e702011-10-18 21:21:00 +02006883/*
6884 * Decode a byte string from a code page into unicode object with an error
6885 * handler.
6886 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006887 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006888 * UnicodeDecodeError exception and returns -1 on error.
6889 */
6890static int
6891decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006892 PyObject **v,
6893 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006894 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006895{
6896 const char *startin = in;
6897 const char *endin = in + size;
6898 const DWORD flags = decode_code_page_flags(code_page);
6899 /* Ideally, we should get reason from FormatMessage. This is the Windows
6900 2000 English version of the message. */
6901 const char *reason = "No mapping for the Unicode character exists "
6902 "in the target code page.";
6903 /* each step cannot decode more than 1 character, but a character can be
6904 represented as a surrogate pair */
6905 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006906 int insize;
6907 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006908 PyObject *errorHandler = NULL;
6909 PyObject *exc = NULL;
6910 PyObject *encoding_obj = NULL;
6911 char *encoding;
6912 DWORD err;
6913 int ret = -1;
6914
6915 assert(size > 0);
6916
6917 encoding = code_page_name(code_page, &encoding_obj);
6918 if (encoding == NULL)
6919 return -1;
6920
Victor Stinner7d00cc12014-03-17 23:08:06 +01006921 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006922 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6923 UnicodeDecodeError. */
6924 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6925 if (exc != NULL) {
6926 PyCodec_StrictErrors(exc);
6927 Py_CLEAR(exc);
6928 }
6929 goto error;
6930 }
6931
6932 if (*v == NULL) {
6933 /* Create unicode object */
6934 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6935 PyErr_NoMemory();
6936 goto error;
6937 }
Victor Stinnerab595942011-12-17 04:59:06 +01006938 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006939 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006940 if (*v == NULL)
6941 goto error;
6942 startout = PyUnicode_AS_UNICODE(*v);
6943 }
6944 else {
6945 /* Extend unicode object */
6946 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6947 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6948 PyErr_NoMemory();
6949 goto error;
6950 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006951 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006952 goto error;
6953 startout = PyUnicode_AS_UNICODE(*v) + n;
6954 }
6955
6956 /* Decode the byte string character per character */
6957 out = startout;
6958 while (in < endin)
6959 {
6960 /* Decode a character */
6961 insize = 1;
6962 do
6963 {
6964 outsize = MultiByteToWideChar(code_page, flags,
6965 in, insize,
6966 buffer, Py_ARRAY_LENGTH(buffer));
6967 if (outsize > 0)
6968 break;
6969 err = GetLastError();
6970 if (err != ERROR_NO_UNICODE_TRANSLATION
6971 && err != ERROR_INSUFFICIENT_BUFFER)
6972 {
6973 PyErr_SetFromWindowsErr(0);
6974 goto error;
6975 }
6976 insize++;
6977 }
6978 /* 4=maximum length of a UTF-8 sequence */
6979 while (insize <= 4 && (in + insize) <= endin);
6980
6981 if (outsize <= 0) {
6982 Py_ssize_t startinpos, endinpos, outpos;
6983
Victor Stinner7d00cc12014-03-17 23:08:06 +01006984 /* last character in partial decode? */
6985 if (in + insize >= endin && !final)
6986 break;
6987
Victor Stinner3a50e702011-10-18 21:21:00 +02006988 startinpos = in - startin;
6989 endinpos = startinpos + 1;
6990 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006991 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006992 errors, &errorHandler,
6993 encoding, reason,
6994 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006995 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006996 {
6997 goto error;
6998 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006999 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007000 }
7001 else {
7002 in += insize;
7003 memcpy(out, buffer, outsize * sizeof(wchar_t));
7004 out += outsize;
7005 }
7006 }
7007
7008 /* write a NUL character at the end */
7009 *out = 0;
7010
7011 /* Extend unicode object */
7012 outsize = out - startout;
7013 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01007014 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02007015 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007016 /* (in - startin) <= size and size is an int */
7017 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007018
7019error:
7020 Py_XDECREF(encoding_obj);
7021 Py_XDECREF(errorHandler);
7022 Py_XDECREF(exc);
7023 return ret;
7024}
7025
Victor Stinner3a50e702011-10-18 21:21:00 +02007026static PyObject *
7027decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007028 const char *s, Py_ssize_t size,
7029 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007030{
Victor Stinner76a31a62011-11-04 00:05:13 +01007031 PyObject *v = NULL;
7032 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007033
Victor Stinner3a50e702011-10-18 21:21:00 +02007034 if (code_page < 0) {
7035 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7036 return NULL;
7037 }
7038
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007039 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007040 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007041
Victor Stinner76a31a62011-11-04 00:05:13 +01007042 do
7043 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007044#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007045 if (size > INT_MAX) {
7046 chunk_size = INT_MAX;
7047 final = 0;
7048 done = 0;
7049 }
7050 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007051#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007052 {
7053 chunk_size = (int)size;
7054 final = (consumed == NULL);
7055 done = 1;
7056 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007057
Victor Stinner76a31a62011-11-04 00:05:13 +01007058 if (chunk_size == 0 && done) {
7059 if (v != NULL)
7060 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007061 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007062 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007063
Victor Stinner76a31a62011-11-04 00:05:13 +01007064 converted = decode_code_page_strict(code_page, &v,
7065 s, chunk_size);
7066 if (converted == -2)
7067 converted = decode_code_page_errors(code_page, &v,
7068 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007069 errors, final);
7070 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007071
7072 if (converted < 0) {
7073 Py_XDECREF(v);
7074 return NULL;
7075 }
7076
7077 if (consumed)
7078 *consumed += converted;
7079
7080 s += converted;
7081 size -= converted;
7082 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007083
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007084 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007085}
7086
Alexander Belopolsky40018472011-02-26 01:02:56 +00007087PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007088PyUnicode_DecodeCodePageStateful(int code_page,
7089 const char *s,
7090 Py_ssize_t size,
7091 const char *errors,
7092 Py_ssize_t *consumed)
7093{
7094 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7095}
7096
7097PyObject *
7098PyUnicode_DecodeMBCSStateful(const char *s,
7099 Py_ssize_t size,
7100 const char *errors,
7101 Py_ssize_t *consumed)
7102{
7103 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7104}
7105
7106PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007107PyUnicode_DecodeMBCS(const char *s,
7108 Py_ssize_t size,
7109 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007110{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007111 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7112}
7113
Victor Stinner3a50e702011-10-18 21:21:00 +02007114static DWORD
7115encode_code_page_flags(UINT code_page, const char *errors)
7116{
7117 if (code_page == CP_UTF8) {
7118 if (winver.dwMajorVersion >= 6)
7119 /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7120 and later */
7121 return WC_ERR_INVALID_CHARS;
7122 else
7123 /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7124 return 0;
7125 }
7126 else if (code_page == CP_UTF7) {
7127 /* CP_UTF7 only supports flags=0 */
7128 return 0;
7129 }
7130 else {
7131 if (errors != NULL && strcmp(errors, "replace") == 0)
7132 return 0;
7133 else
7134 return WC_NO_BEST_FIT_CHARS;
7135 }
7136}
7137
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007138/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 * Encode a Unicode string to a Windows code page into a byte string in strict
7140 * mode.
7141 *
7142 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007143 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007144 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007145static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007146encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007147 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007148 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007149{
Victor Stinner554f3f02010-06-16 23:33:54 +00007150 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007151 BOOL *pusedDefaultChar = &usedDefaultChar;
7152 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007153 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007154 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007155 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007156 const DWORD flags = encode_code_page_flags(code_page, NULL);
7157 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007158 /* Create a substring so that we can get the UTF-16 representation
7159 of just the slice under consideration. */
7160 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007161
Martin v. Löwis3d325192011-11-04 18:23:06 +01007162 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007163
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007165 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007167 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007168
Victor Stinner2fc507f2011-11-04 20:06:39 +01007169 substring = PyUnicode_Substring(unicode, offset, offset+len);
7170 if (substring == NULL)
7171 return -1;
7172 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7173 if (p == NULL) {
7174 Py_DECREF(substring);
7175 return -1;
7176 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007177 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007178
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007179 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007181 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007182 NULL, 0,
7183 NULL, pusedDefaultChar);
7184 if (outsize <= 0)
7185 goto error;
7186 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007187 if (pusedDefaultChar && *pusedDefaultChar) {
7188 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007189 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007190 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007191
Victor Stinner3a50e702011-10-18 21:21:00 +02007192 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007193 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007194 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007195 if (*outbytes == NULL) {
7196 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007198 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007199 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007200 }
7201 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007203 const Py_ssize_t n = PyBytes_Size(*outbytes);
7204 if (outsize > PY_SSIZE_T_MAX - n) {
7205 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007206 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007207 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007208 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007209 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7210 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007211 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007212 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007213 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007214 }
7215
7216 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007217 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007218 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007219 out, outsize,
7220 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007221 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007222 if (outsize <= 0)
7223 goto error;
7224 if (pusedDefaultChar && *pusedDefaultChar)
7225 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007226 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007227
Victor Stinner3a50e702011-10-18 21:21:00 +02007228error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007229 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007230 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7231 return -2;
7232 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007233 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007234}
7235
Victor Stinner3a50e702011-10-18 21:21:00 +02007236/*
7237 * Encode a Unicode string to a Windows code page into a byte string using a
7238 * error handler.
7239 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007240 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007241 * -1 on other error.
7242 */
7243static int
7244encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007245 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007246 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007247{
Victor Stinner3a50e702011-10-18 21:21:00 +02007248 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007249 Py_ssize_t pos = unicode_offset;
7250 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007251 /* Ideally, we should get reason from FormatMessage. This is the Windows
7252 2000 English version of the message. */
7253 const char *reason = "invalid character";
7254 /* 4=maximum length of a UTF-8 sequence */
7255 char buffer[4];
7256 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7257 Py_ssize_t outsize;
7258 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007259 PyObject *errorHandler = NULL;
7260 PyObject *exc = NULL;
7261 PyObject *encoding_obj = NULL;
7262 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007263 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007264 PyObject *rep;
7265 int ret = -1;
7266
7267 assert(insize > 0);
7268
7269 encoding = code_page_name(code_page, &encoding_obj);
7270 if (encoding == NULL)
7271 return -1;
7272
7273 if (errors == NULL || strcmp(errors, "strict") == 0) {
7274 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7275 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007276 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007277 if (exc != NULL) {
7278 PyCodec_StrictErrors(exc);
7279 Py_DECREF(exc);
7280 }
7281 Py_XDECREF(encoding_obj);
7282 return -1;
7283 }
7284
7285 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7286 pusedDefaultChar = &usedDefaultChar;
7287 else
7288 pusedDefaultChar = NULL;
7289
7290 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7291 PyErr_NoMemory();
7292 goto error;
7293 }
7294 outsize = insize * Py_ARRAY_LENGTH(buffer);
7295
7296 if (*outbytes == NULL) {
7297 /* Create string object */
7298 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7299 if (*outbytes == NULL)
7300 goto error;
7301 out = PyBytes_AS_STRING(*outbytes);
7302 }
7303 else {
7304 /* Extend string object */
7305 Py_ssize_t n = PyBytes_Size(*outbytes);
7306 if (n > PY_SSIZE_T_MAX - outsize) {
7307 PyErr_NoMemory();
7308 goto error;
7309 }
7310 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7311 goto error;
7312 out = PyBytes_AS_STRING(*outbytes) + n;
7313 }
7314
7315 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007316 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007317 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007318 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7319 wchar_t chars[2];
7320 int charsize;
7321 if (ch < 0x10000) {
7322 chars[0] = (wchar_t)ch;
7323 charsize = 1;
7324 }
7325 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007326 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7327 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007328 charsize = 2;
7329 }
7330
Victor Stinner3a50e702011-10-18 21:21:00 +02007331 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007332 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007333 buffer, Py_ARRAY_LENGTH(buffer),
7334 NULL, pusedDefaultChar);
7335 if (outsize > 0) {
7336 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7337 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007338 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007339 memcpy(out, buffer, outsize);
7340 out += outsize;
7341 continue;
7342 }
7343 }
7344 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7345 PyErr_SetFromWindowsErr(0);
7346 goto error;
7347 }
7348
Victor Stinner3a50e702011-10-18 21:21:00 +02007349 rep = unicode_encode_call_errorhandler(
7350 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007351 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007352 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007353 if (rep == NULL)
7354 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007355 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007356
7357 if (PyBytes_Check(rep)) {
7358 outsize = PyBytes_GET_SIZE(rep);
7359 if (outsize != 1) {
7360 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7361 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7362 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7363 Py_DECREF(rep);
7364 goto error;
7365 }
7366 out = PyBytes_AS_STRING(*outbytes) + offset;
7367 }
7368 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7369 out += outsize;
7370 }
7371 else {
7372 Py_ssize_t i;
7373 enum PyUnicode_Kind kind;
7374 void *data;
7375
Benjamin Petersonbac79492012-01-14 13:34:47 -05007376 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007377 Py_DECREF(rep);
7378 goto error;
7379 }
7380
7381 outsize = PyUnicode_GET_LENGTH(rep);
7382 if (outsize != 1) {
7383 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7384 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7385 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7386 Py_DECREF(rep);
7387 goto error;
7388 }
7389 out = PyBytes_AS_STRING(*outbytes) + offset;
7390 }
7391 kind = PyUnicode_KIND(rep);
7392 data = PyUnicode_DATA(rep);
7393 for (i=0; i < outsize; i++) {
7394 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7395 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007396 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007397 encoding, unicode,
7398 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007399 "unable to encode error handler result to ASCII");
7400 Py_DECREF(rep);
7401 goto error;
7402 }
7403 *out = (unsigned char)ch;
7404 out++;
7405 }
7406 }
7407 Py_DECREF(rep);
7408 }
7409 /* write a NUL byte */
7410 *out = 0;
7411 outsize = out - PyBytes_AS_STRING(*outbytes);
7412 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7413 if (_PyBytes_Resize(outbytes, outsize) < 0)
7414 goto error;
7415 ret = 0;
7416
7417error:
7418 Py_XDECREF(encoding_obj);
7419 Py_XDECREF(errorHandler);
7420 Py_XDECREF(exc);
7421 return ret;
7422}
7423
Victor Stinner3a50e702011-10-18 21:21:00 +02007424static PyObject *
7425encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007426 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 const char *errors)
7428{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007429 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007430 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007431 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007432 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007433
Benjamin Petersonbac79492012-01-14 13:34:47 -05007434 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007435 return NULL;
7436 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007437
Victor Stinner3a50e702011-10-18 21:21:00 +02007438 if (code_page < 0) {
7439 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7440 return NULL;
7441 }
7442
Martin v. Löwis3d325192011-11-04 18:23:06 +01007443 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007444 return PyBytes_FromStringAndSize(NULL, 0);
7445
Victor Stinner7581cef2011-11-03 22:32:33 +01007446 offset = 0;
7447 do
7448 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007449#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007450 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007451 chunks. */
7452 if (len > INT_MAX/2) {
7453 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007454 done = 0;
7455 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007456 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007457#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007458 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007459 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007460 done = 1;
7461 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007462
Victor Stinner76a31a62011-11-04 00:05:13 +01007463 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007464 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007465 errors);
7466 if (ret == -2)
7467 ret = encode_code_page_errors(code_page, &outbytes,
7468 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007469 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007470 if (ret < 0) {
7471 Py_XDECREF(outbytes);
7472 return NULL;
7473 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007474
Victor Stinner7581cef2011-11-03 22:32:33 +01007475 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007476 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007477 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007478
Victor Stinner3a50e702011-10-18 21:21:00 +02007479 return outbytes;
7480}
7481
7482PyObject *
7483PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7484 Py_ssize_t size,
7485 const char *errors)
7486{
Victor Stinner7581cef2011-11-03 22:32:33 +01007487 PyObject *unicode, *res;
7488 unicode = PyUnicode_FromUnicode(p, size);
7489 if (unicode == NULL)
7490 return NULL;
7491 res = encode_code_page(CP_ACP, unicode, errors);
7492 Py_DECREF(unicode);
7493 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007494}
7495
7496PyObject *
7497PyUnicode_EncodeCodePage(int code_page,
7498 PyObject *unicode,
7499 const char *errors)
7500{
Victor Stinner7581cef2011-11-03 22:32:33 +01007501 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007502}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007503
Alexander Belopolsky40018472011-02-26 01:02:56 +00007504PyObject *
7505PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007506{
7507 if (!PyUnicode_Check(unicode)) {
7508 PyErr_BadArgument();
7509 return NULL;
7510 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007511 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007512}
7513
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007514#undef NEED_RETRY
7515
Victor Stinner99b95382011-07-04 14:23:54 +02007516#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007517
Guido van Rossumd57fd912000-03-10 22:53:23 +00007518/* --- Character Mapping Codec -------------------------------------------- */
7519
Victor Stinnerfb161b12013-04-18 01:44:27 +02007520static int
7521charmap_decode_string(const char *s,
7522 Py_ssize_t size,
7523 PyObject *mapping,
7524 const char *errors,
7525 _PyUnicodeWriter *writer)
7526{
7527 const char *starts = s;
7528 const char *e;
7529 Py_ssize_t startinpos, endinpos;
7530 PyObject *errorHandler = NULL, *exc = NULL;
7531 Py_ssize_t maplen;
7532 enum PyUnicode_Kind mapkind;
7533 void *mapdata;
7534 Py_UCS4 x;
7535 unsigned char ch;
7536
7537 if (PyUnicode_READY(mapping) == -1)
7538 return -1;
7539
7540 maplen = PyUnicode_GET_LENGTH(mapping);
7541 mapdata = PyUnicode_DATA(mapping);
7542 mapkind = PyUnicode_KIND(mapping);
7543
7544 e = s + size;
7545
7546 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7547 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7548 * is disabled in encoding aliases, latin1 is preferred because
7549 * its implementation is faster. */
7550 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7551 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7552 Py_UCS4 maxchar = writer->maxchar;
7553
7554 assert (writer->kind == PyUnicode_1BYTE_KIND);
7555 while (s < e) {
7556 ch = *s;
7557 x = mapdata_ucs1[ch];
7558 if (x > maxchar) {
7559 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7560 goto onError;
7561 maxchar = writer->maxchar;
7562 outdata = (Py_UCS1 *)writer->data;
7563 }
7564 outdata[writer->pos] = x;
7565 writer->pos++;
7566 ++s;
7567 }
7568 return 0;
7569 }
7570
7571 while (s < e) {
7572 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7573 enum PyUnicode_Kind outkind = writer->kind;
7574 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7575 if (outkind == PyUnicode_1BYTE_KIND) {
7576 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7577 Py_UCS4 maxchar = writer->maxchar;
7578 while (s < e) {
7579 ch = *s;
7580 x = mapdata_ucs2[ch];
7581 if (x > maxchar)
7582 goto Error;
7583 outdata[writer->pos] = x;
7584 writer->pos++;
7585 ++s;
7586 }
7587 break;
7588 }
7589 else if (outkind == PyUnicode_2BYTE_KIND) {
7590 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7591 while (s < e) {
7592 ch = *s;
7593 x = mapdata_ucs2[ch];
7594 if (x == 0xFFFE)
7595 goto Error;
7596 outdata[writer->pos] = x;
7597 writer->pos++;
7598 ++s;
7599 }
7600 break;
7601 }
7602 }
7603 ch = *s;
7604
7605 if (ch < maplen)
7606 x = PyUnicode_READ(mapkind, mapdata, ch);
7607 else
7608 x = 0xfffe; /* invalid value */
7609Error:
7610 if (x == 0xfffe)
7611 {
7612 /* undefined mapping */
7613 startinpos = s-starts;
7614 endinpos = startinpos+1;
7615 if (unicode_decode_call_errorhandler_writer(
7616 errors, &errorHandler,
7617 "charmap", "character maps to <undefined>",
7618 &starts, &e, &startinpos, &endinpos, &exc, &s,
7619 writer)) {
7620 goto onError;
7621 }
7622 continue;
7623 }
7624
7625 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7626 goto onError;
7627 ++s;
7628 }
7629 Py_XDECREF(errorHandler);
7630 Py_XDECREF(exc);
7631 return 0;
7632
7633onError:
7634 Py_XDECREF(errorHandler);
7635 Py_XDECREF(exc);
7636 return -1;
7637}
7638
7639static int
7640charmap_decode_mapping(const char *s,
7641 Py_ssize_t size,
7642 PyObject *mapping,
7643 const char *errors,
7644 _PyUnicodeWriter *writer)
7645{
7646 const char *starts = s;
7647 const char *e;
7648 Py_ssize_t startinpos, endinpos;
7649 PyObject *errorHandler = NULL, *exc = NULL;
7650 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007651 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007652
7653 e = s + size;
7654
7655 while (s < e) {
7656 ch = *s;
7657
7658 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7659 key = PyLong_FromLong((long)ch);
7660 if (key == NULL)
7661 goto onError;
7662
7663 item = PyObject_GetItem(mapping, key);
7664 Py_DECREF(key);
7665 if (item == NULL) {
7666 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7667 /* No mapping found means: mapping is undefined. */
7668 PyErr_Clear();
7669 goto Undefined;
7670 } else
7671 goto onError;
7672 }
7673
7674 /* Apply mapping */
7675 if (item == Py_None)
7676 goto Undefined;
7677 if (PyLong_Check(item)) {
7678 long value = PyLong_AS_LONG(item);
7679 if (value == 0xFFFE)
7680 goto Undefined;
7681 if (value < 0 || value > MAX_UNICODE) {
7682 PyErr_Format(PyExc_TypeError,
7683 "character mapping must be in range(0x%lx)",
7684 (unsigned long)MAX_UNICODE + 1);
7685 goto onError;
7686 }
7687
7688 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7689 goto onError;
7690 }
7691 else if (PyUnicode_Check(item)) {
7692 if (PyUnicode_READY(item) == -1)
7693 goto onError;
7694 if (PyUnicode_GET_LENGTH(item) == 1) {
7695 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7696 if (value == 0xFFFE)
7697 goto Undefined;
7698 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7699 goto onError;
7700 }
7701 else {
7702 writer->overallocate = 1;
7703 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7704 goto onError;
7705 }
7706 }
7707 else {
7708 /* wrong return value */
7709 PyErr_SetString(PyExc_TypeError,
7710 "character mapping must return integer, None or str");
7711 goto onError;
7712 }
7713 Py_CLEAR(item);
7714 ++s;
7715 continue;
7716
7717Undefined:
7718 /* undefined mapping */
7719 Py_CLEAR(item);
7720 startinpos = s-starts;
7721 endinpos = startinpos+1;
7722 if (unicode_decode_call_errorhandler_writer(
7723 errors, &errorHandler,
7724 "charmap", "character maps to <undefined>",
7725 &starts, &e, &startinpos, &endinpos, &exc, &s,
7726 writer)) {
7727 goto onError;
7728 }
7729 }
7730 Py_XDECREF(errorHandler);
7731 Py_XDECREF(exc);
7732 return 0;
7733
7734onError:
7735 Py_XDECREF(item);
7736 Py_XDECREF(errorHandler);
7737 Py_XDECREF(exc);
7738 return -1;
7739}
7740
Alexander Belopolsky40018472011-02-26 01:02:56 +00007741PyObject *
7742PyUnicode_DecodeCharmap(const char *s,
7743 Py_ssize_t size,
7744 PyObject *mapping,
7745 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007746{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007747 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007748
Guido van Rossumd57fd912000-03-10 22:53:23 +00007749 /* Default to Latin-1 */
7750 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007752
Guido van Rossumd57fd912000-03-10 22:53:23 +00007753 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007754 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007755 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007756 writer.min_length = size;
7757 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007758 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007759
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007760 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007761 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7762 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007763 }
7764 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007765 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7766 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007767 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007768 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007769
Benjamin Peterson29060642009-01-31 22:14:21 +00007770 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007771 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007772 return NULL;
7773}
7774
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007775/* Charmap encoding: the lookup table */
7776
Alexander Belopolsky40018472011-02-26 01:02:56 +00007777struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007778 PyObject_HEAD
7779 unsigned char level1[32];
7780 int count2, count3;
7781 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007782};
7783
7784static PyObject*
7785encoding_map_size(PyObject *obj, PyObject* args)
7786{
7787 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007788 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007789 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007790}
7791
7792static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007793 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007794 PyDoc_STR("Return the size (in bytes) of this object") },
7795 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007796};
7797
7798static void
7799encoding_map_dealloc(PyObject* o)
7800{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007801 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007802}
7803
7804static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007805 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007806 "EncodingMap", /*tp_name*/
7807 sizeof(struct encoding_map), /*tp_basicsize*/
7808 0, /*tp_itemsize*/
7809 /* methods */
7810 encoding_map_dealloc, /*tp_dealloc*/
7811 0, /*tp_print*/
7812 0, /*tp_getattr*/
7813 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007814 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007815 0, /*tp_repr*/
7816 0, /*tp_as_number*/
7817 0, /*tp_as_sequence*/
7818 0, /*tp_as_mapping*/
7819 0, /*tp_hash*/
7820 0, /*tp_call*/
7821 0, /*tp_str*/
7822 0, /*tp_getattro*/
7823 0, /*tp_setattro*/
7824 0, /*tp_as_buffer*/
7825 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7826 0, /*tp_doc*/
7827 0, /*tp_traverse*/
7828 0, /*tp_clear*/
7829 0, /*tp_richcompare*/
7830 0, /*tp_weaklistoffset*/
7831 0, /*tp_iter*/
7832 0, /*tp_iternext*/
7833 encoding_map_methods, /*tp_methods*/
7834 0, /*tp_members*/
7835 0, /*tp_getset*/
7836 0, /*tp_base*/
7837 0, /*tp_dict*/
7838 0, /*tp_descr_get*/
7839 0, /*tp_descr_set*/
7840 0, /*tp_dictoffset*/
7841 0, /*tp_init*/
7842 0, /*tp_alloc*/
7843 0, /*tp_new*/
7844 0, /*tp_free*/
7845 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007846};
7847
7848PyObject*
7849PyUnicode_BuildEncodingMap(PyObject* string)
7850{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007851 PyObject *result;
7852 struct encoding_map *mresult;
7853 int i;
7854 int need_dict = 0;
7855 unsigned char level1[32];
7856 unsigned char level2[512];
7857 unsigned char *mlevel1, *mlevel2, *mlevel3;
7858 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007859 int kind;
7860 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007861 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007862 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007863
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007864 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007865 PyErr_BadArgument();
7866 return NULL;
7867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007868 kind = PyUnicode_KIND(string);
7869 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007870 length = PyUnicode_GET_LENGTH(string);
7871 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007872 memset(level1, 0xFF, sizeof level1);
7873 memset(level2, 0xFF, sizeof level2);
7874
7875 /* If there isn't a one-to-one mapping of NULL to \0,
7876 or if there are non-BMP characters, we need to use
7877 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007878 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007879 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007880 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007881 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007882 ch = PyUnicode_READ(kind, data, i);
7883 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007884 need_dict = 1;
7885 break;
7886 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007887 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007888 /* unmapped character */
7889 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007890 l1 = ch >> 11;
7891 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007892 if (level1[l1] == 0xFF)
7893 level1[l1] = count2++;
7894 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007895 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007896 }
7897
7898 if (count2 >= 0xFF || count3 >= 0xFF)
7899 need_dict = 1;
7900
7901 if (need_dict) {
7902 PyObject *result = PyDict_New();
7903 PyObject *key, *value;
7904 if (!result)
7905 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007906 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007907 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007908 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007909 if (!key || !value)
7910 goto failed1;
7911 if (PyDict_SetItem(result, key, value) == -1)
7912 goto failed1;
7913 Py_DECREF(key);
7914 Py_DECREF(value);
7915 }
7916 return result;
7917 failed1:
7918 Py_XDECREF(key);
7919 Py_XDECREF(value);
7920 Py_DECREF(result);
7921 return NULL;
7922 }
7923
7924 /* Create a three-level trie */
7925 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7926 16*count2 + 128*count3 - 1);
7927 if (!result)
7928 return PyErr_NoMemory();
7929 PyObject_Init(result, &EncodingMapType);
7930 mresult = (struct encoding_map*)result;
7931 mresult->count2 = count2;
7932 mresult->count3 = count3;
7933 mlevel1 = mresult->level1;
7934 mlevel2 = mresult->level23;
7935 mlevel3 = mresult->level23 + 16*count2;
7936 memcpy(mlevel1, level1, 32);
7937 memset(mlevel2, 0xFF, 16*count2);
7938 memset(mlevel3, 0, 128*count3);
7939 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007940 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007941 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007942 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7943 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007944 /* unmapped character */
7945 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007946 o1 = ch>>11;
7947 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007948 i2 = 16*mlevel1[o1] + o2;
7949 if (mlevel2[i2] == 0xFF)
7950 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007951 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007952 i3 = 128*mlevel2[i2] + o3;
7953 mlevel3[i3] = i;
7954 }
7955 return result;
7956}
7957
7958static int
Victor Stinner22168992011-11-20 17:09:18 +01007959encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007960{
7961 struct encoding_map *map = (struct encoding_map*)mapping;
7962 int l1 = c>>11;
7963 int l2 = (c>>7) & 0xF;
7964 int l3 = c & 0x7F;
7965 int i;
7966
Victor Stinner22168992011-11-20 17:09:18 +01007967 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007968 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007969 if (c == 0)
7970 return 0;
7971 /* level 1*/
7972 i = map->level1[l1];
7973 if (i == 0xFF) {
7974 return -1;
7975 }
7976 /* level 2*/
7977 i = map->level23[16*i+l2];
7978 if (i == 0xFF) {
7979 return -1;
7980 }
7981 /* level 3 */
7982 i = map->level23[16*map->count2 + 128*i + l3];
7983 if (i == 0) {
7984 return -1;
7985 }
7986 return i;
7987}
7988
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007989/* Lookup the character ch in the mapping. If the character
7990 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007991 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007992static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007993charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007994{
Christian Heimes217cfd12007-12-02 14:31:20 +00007995 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007996 PyObject *x;
7997
7998 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007999 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008000 x = PyObject_GetItem(mapping, w);
8001 Py_DECREF(w);
8002 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008003 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8004 /* No mapping found means: mapping is undefined. */
8005 PyErr_Clear();
8006 x = Py_None;
8007 Py_INCREF(x);
8008 return x;
8009 } else
8010 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008011 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008012 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008013 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008014 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008015 long value = PyLong_AS_LONG(x);
8016 if (value < 0 || value > 255) {
8017 PyErr_SetString(PyExc_TypeError,
8018 "character mapping must be in range(256)");
8019 Py_DECREF(x);
8020 return NULL;
8021 }
8022 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008023 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008024 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008026 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008027 /* wrong return value */
8028 PyErr_Format(PyExc_TypeError,
8029 "character mapping must return integer, bytes or None, not %.400s",
8030 x->ob_type->tp_name);
8031 Py_DECREF(x);
8032 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008033 }
8034}
8035
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008036static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008037charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008038{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008039 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8040 /* exponentially overallocate to minimize reallocations */
8041 if (requiredsize < 2*outsize)
8042 requiredsize = 2*outsize;
8043 if (_PyBytes_Resize(outobj, requiredsize))
8044 return -1;
8045 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008046}
8047
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the byte(s) were written to the output */
    enc_FAILED,     /* the character has no mapping; nothing written */
    enc_EXCEPTION   /* lookup or resize failed; an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008051/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008052 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008053 space is available. Return a new reference to the object that
8054 was put in the output buffer, or Py_None, if the mapping was undefined
8055 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008056 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008057static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008058charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008059 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008060{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008061 PyObject *rep;
8062 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008063 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008064
Christian Heimes90aa7642007-12-19 02:45:37 +00008065 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008066 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008068 if (res == -1)
8069 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008070 if (outsize<requiredsize)
8071 if (charmapencode_resize(outobj, outpos, requiredsize))
8072 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008073 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 outstart[(*outpos)++] = (char)res;
8075 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008076 }
8077
8078 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008079 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008080 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008081 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008082 Py_DECREF(rep);
8083 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008084 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 if (PyLong_Check(rep)) {
8086 Py_ssize_t requiredsize = *outpos+1;
8087 if (outsize<requiredsize)
8088 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8089 Py_DECREF(rep);
8090 return enc_EXCEPTION;
8091 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008092 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008094 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008095 else {
8096 const char *repchars = PyBytes_AS_STRING(rep);
8097 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8098 Py_ssize_t requiredsize = *outpos+repsize;
8099 if (outsize<requiredsize)
8100 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8101 Py_DECREF(rep);
8102 return enc_EXCEPTION;
8103 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008104 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008105 memcpy(outstart + *outpos, repchars, repsize);
8106 *outpos += repsize;
8107 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008108 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008109 Py_DECREF(rep);
8110 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008111}
8112
8113/* handle an error in PyUnicode_EncodeCharmap
8114 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008115static int
8116charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008117 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008118 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008119 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008120 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121{
8122 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008123 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008124 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008125 enum PyUnicode_Kind kind;
8126 void *data;
8127 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008128 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008129 Py_ssize_t collstartpos = *inpos;
8130 Py_ssize_t collendpos = *inpos+1;
8131 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008132 char *encoding = "charmap";
8133 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008134 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008135 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008136 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008137
Benjamin Petersonbac79492012-01-14 13:34:47 -05008138 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008139 return -1;
8140 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008141 /* find all unencodable characters */
8142 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008143 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008144 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008145 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008146 val = encoding_map_lookup(ch, mapping);
8147 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 break;
8149 ++collendpos;
8150 continue;
8151 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008152
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008153 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8154 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008155 if (rep==NULL)
8156 return -1;
8157 else if (rep!=Py_None) {
8158 Py_DECREF(rep);
8159 break;
8160 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008161 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008163 }
8164 /* cache callback name lookup
8165 * (if not done yet, i.e. it's the first error) */
8166 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 if ((errors==NULL) || (!strcmp(errors, "strict")))
8168 *known_errorHandler = 1;
8169 else if (!strcmp(errors, "replace"))
8170 *known_errorHandler = 2;
8171 else if (!strcmp(errors, "ignore"))
8172 *known_errorHandler = 3;
8173 else if (!strcmp(errors, "xmlcharrefreplace"))
8174 *known_errorHandler = 4;
8175 else
8176 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008177 }
8178 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008179 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008180 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008181 return -1;
8182 case 2: /* replace */
8183 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 x = charmapencode_output('?', mapping, res, respos);
8185 if (x==enc_EXCEPTION) {
8186 return -1;
8187 }
8188 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008189 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008190 return -1;
8191 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008192 }
8193 /* fall through */
8194 case 3: /* ignore */
8195 *inpos = collendpos;
8196 break;
8197 case 4: /* xmlcharrefreplace */
8198 /* generate replacement (temporarily (mis)uses p) */
8199 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008200 char buffer[2+29+1+1];
8201 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008202 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008203 for (cp = buffer; *cp; ++cp) {
8204 x = charmapencode_output(*cp, mapping, res, respos);
8205 if (x==enc_EXCEPTION)
8206 return -1;
8207 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008208 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 return -1;
8210 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008211 }
8212 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008213 *inpos = collendpos;
8214 break;
8215 default:
8216 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008217 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008218 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008219 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008221 if (PyBytes_Check(repunicode)) {
8222 /* Directly copy bytes result to output. */
8223 Py_ssize_t outsize = PyBytes_Size(*res);
8224 Py_ssize_t requiredsize;
8225 repsize = PyBytes_Size(repunicode);
8226 requiredsize = *respos + repsize;
8227 if (requiredsize > outsize)
8228 /* Make room for all additional bytes. */
8229 if (charmapencode_resize(res, respos, requiredsize)) {
8230 Py_DECREF(repunicode);
8231 return -1;
8232 }
8233 memcpy(PyBytes_AsString(*res) + *respos,
8234 PyBytes_AsString(repunicode), repsize);
8235 *respos += repsize;
8236 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008237 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008238 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008239 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008240 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008241 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008242 Py_DECREF(repunicode);
8243 return -1;
8244 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008245 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008246 data = PyUnicode_DATA(repunicode);
8247 kind = PyUnicode_KIND(repunicode);
8248 for (index = 0; index < repsize; index++) {
8249 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8250 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008251 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008252 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008253 return -1;
8254 }
8255 else if (x==enc_FAILED) {
8256 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008257 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008258 return -1;
8259 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008260 }
8261 *inpos = newpos;
8262 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263 }
8264 return 0;
8265}
8266
Alexander Belopolsky40018472011-02-26 01:02:56 +00008267PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008268_PyUnicode_EncodeCharmap(PyObject *unicode,
8269 PyObject *mapping,
8270 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008271{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008272 /* output object */
8273 PyObject *res = NULL;
8274 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008275 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008276 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008277 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008278 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008279 PyObject *errorHandler = NULL;
8280 PyObject *exc = NULL;
8281 /* the following variable is used for caching string comparisons
8282 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8283 * 3=ignore, 4=xmlcharrefreplace */
8284 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008285 void *data;
8286 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008287
Benjamin Petersonbac79492012-01-14 13:34:47 -05008288 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008289 return NULL;
8290 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008291 data = PyUnicode_DATA(unicode);
8292 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008293
Guido van Rossumd57fd912000-03-10 22:53:23 +00008294 /* Default to Latin-1 */
8295 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008296 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008297
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008298 /* allocate enough for a simple encoding without
8299 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008300 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301 if (res == NULL)
8302 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008303 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008304 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008305
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008307 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008309 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008310 if (x==enc_EXCEPTION) /* error */
8311 goto onError;
8312 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008313 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008314 &exc,
8315 &known_errorHandler, &errorHandler, errors,
8316 &res, &respos)) {
8317 goto onError;
8318 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008319 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 else
8321 /* done with this character => adjust input position */
8322 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008325 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008326 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008327 if (_PyBytes_Resize(&res, respos) < 0)
8328 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330 Py_XDECREF(exc);
8331 Py_XDECREF(errorHandler);
8332 return res;
8333
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008335 Py_XDECREF(res);
8336 Py_XDECREF(exc);
8337 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008338 return NULL;
8339}
8340
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008341/* Deprecated */
8342PyObject *
8343PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8344 Py_ssize_t size,
8345 PyObject *mapping,
8346 const char *errors)
8347{
8348 PyObject *result;
8349 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8350 if (unicode == NULL)
8351 return NULL;
8352 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8353 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008354 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008355}
8356
Alexander Belopolsky40018472011-02-26 01:02:56 +00008357PyObject *
8358PyUnicode_AsCharmapString(PyObject *unicode,
8359 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008360{
8361 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008362 PyErr_BadArgument();
8363 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008364 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008365 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008366}
8367
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008369static void
8370make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008371 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008372 Py_ssize_t startpos, Py_ssize_t endpos,
8373 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008374{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008376 *exceptionObject = _PyUnicodeTranslateError_Create(
8377 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008378 }
8379 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8381 goto onError;
8382 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8383 goto onError;
8384 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8385 goto onError;
8386 return;
8387 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008388 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008389 }
8390}
8391
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392/* error handling callback helper:
8393 build arguments, call the callback and check the arguments,
8394 put the result into newpos and return the replacement string, which
8395 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008396static PyObject *
8397unicode_translate_call_errorhandler(const char *errors,
8398 PyObject **errorHandler,
8399 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008400 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008401 Py_ssize_t startpos, Py_ssize_t endpos,
8402 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008403{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008404 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008405
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008406 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407 PyObject *restuple;
8408 PyObject *resunicode;
8409
8410 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008411 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008413 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414 }
8415
8416 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008417 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008419 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420
8421 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008422 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008426 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008427 Py_DECREF(restuple);
8428 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008429 }
8430 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008431 &resunicode, &i_newpos)) {
8432 Py_DECREF(restuple);
8433 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008435 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008436 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008437 else
8438 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008439 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008440 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008441 Py_DECREF(restuple);
8442 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008443 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444 Py_INCREF(resunicode);
8445 Py_DECREF(restuple);
8446 return resunicode;
8447}
8448
8449/* Lookup the character ch in the mapping and put the result in result,
8450 which must be decrefed by the caller.
8451 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008452static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008453charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008454{
Christian Heimes217cfd12007-12-02 14:31:20 +00008455 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008456 PyObject *x;
8457
8458 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008460 x = PyObject_GetItem(mapping, w);
8461 Py_DECREF(w);
8462 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8464 /* No mapping found means: use 1:1 mapping. */
8465 PyErr_Clear();
8466 *result = NULL;
8467 return 0;
8468 } else
8469 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008470 }
8471 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 *result = x;
8473 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008474 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008475 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008477 if (value < 0 || value > MAX_UNICODE) {
8478 PyErr_Format(PyExc_ValueError,
8479 "character mapping must be in range(0x%x)",
8480 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008481 Py_DECREF(x);
8482 return -1;
8483 }
8484 *result = x;
8485 return 0;
8486 }
8487 else if (PyUnicode_Check(x)) {
8488 *result = x;
8489 return 0;
8490 }
8491 else {
8492 /* wrong return value */
8493 PyErr_SetString(PyExc_TypeError,
8494 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008495 Py_DECREF(x);
8496 return -1;
8497 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008498}
Victor Stinner1194ea02014-04-04 19:37:40 +02008499
8500/* lookup the character, write the result into the writer.
8501 Return 1 if the result was written into the writer, return 0 if the mapping
8502 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008503static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008504charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8505 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008506{
Victor Stinner1194ea02014-04-04 19:37:40 +02008507 PyObject *item;
8508
8509 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008511
8512 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008514 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008515 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008517 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008518 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008519
8520 if (item == Py_None) {
8521 Py_DECREF(item);
8522 return 0;
8523 }
8524
8525 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008526 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8527 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8528 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008529 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8530 Py_DECREF(item);
8531 return -1;
8532 }
8533 Py_DECREF(item);
8534 return 1;
8535 }
8536
8537 if (!PyUnicode_Check(item)) {
8538 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008539 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008540 }
8541
8542 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8543 Py_DECREF(item);
8544 return -1;
8545 }
8546
8547 Py_DECREF(item);
8548 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008549}
8550
Victor Stinner89a76ab2014-04-05 11:44:04 +02008551static int
8552unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8553 Py_UCS1 *translate)
8554{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008555 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008556 int ret = 0;
8557
Victor Stinner89a76ab2014-04-05 11:44:04 +02008558 if (charmaptranslate_lookup(ch, mapping, &item)) {
8559 return -1;
8560 }
8561
8562 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008563 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008564 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008565 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008566 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008567 /* not found => default to 1:1 mapping */
8568 translate[ch] = ch;
8569 return 1;
8570 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008571 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008572 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008573 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8574 used it */
8575 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008576 /* invalid character or character outside ASCII:
8577 skip the fast translate */
8578 goto exit;
8579 }
8580 translate[ch] = (Py_UCS1)replace;
8581 }
8582 else if (PyUnicode_Check(item)) {
8583 Py_UCS4 replace;
8584
8585 if (PyUnicode_READY(item) == -1) {
8586 Py_DECREF(item);
8587 return -1;
8588 }
8589 if (PyUnicode_GET_LENGTH(item) != 1)
8590 goto exit;
8591
8592 replace = PyUnicode_READ_CHAR(item, 0);
8593 if (replace > 127)
8594 goto exit;
8595 translate[ch] = (Py_UCS1)replace;
8596 }
8597 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008598 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008599 goto exit;
8600 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008601 ret = 1;
8602
Benjamin Peterson1365de72014-04-07 20:15:41 -04008603 exit:
8604 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008605 return ret;
8606}
8607
8608/* Fast path for ascii => ascii translation. Return 1 if the whole string
8609 was translated into writer, return 0 if the input string was partially
8610 translated into writer, raise an exception and return -1 on error. */
8611static int
8612unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner872b2912014-04-05 14:27:07 +02008613 _PyUnicodeWriter *writer, int ignore)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008614{
Victor Stinner872b2912014-04-05 14:27:07 +02008615 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008616 Py_ssize_t len;
8617 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008618 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008619
8620 if (PyUnicode_READY(input) == -1)
8621 return -1;
8622 if (!PyUnicode_IS_ASCII(input))
8623 return 0;
8624 len = PyUnicode_GET_LENGTH(input);
8625
Victor Stinner872b2912014-04-05 14:27:07 +02008626 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008627
8628 in = PyUnicode_1BYTE_DATA(input);
8629 end = in + len;
8630
8631 assert(PyUnicode_IS_ASCII(writer->buffer));
8632 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8633 out = PyUnicode_1BYTE_DATA(writer->buffer);
8634
Victor Stinner872b2912014-04-05 14:27:07 +02008635 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008636 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008637 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008638 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008639 int translate = unicode_fast_translate_lookup(mapping, ch,
8640 ascii_table);
8641 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008642 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008643 if (translate == 0)
8644 goto exit;
8645 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008646 }
Victor Stinner872b2912014-04-05 14:27:07 +02008647 if (ch2 == 0xfe) {
8648 if (ignore)
8649 continue;
8650 goto exit;
8651 }
8652 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008653 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008654 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008655 }
Victor Stinner872b2912014-04-05 14:27:07 +02008656 res = 1;
8657
8658exit:
8659 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8660 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008661}
8662
Alexander Belopolsky40018472011-02-26 01:02:56 +00008663PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664_PyUnicode_TranslateCharmap(PyObject *input,
8665 PyObject *mapping,
8666 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008668 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008669 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008670 Py_ssize_t size, i;
8671 int kind;
8672 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008673 _PyUnicodeWriter writer;
8674 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675 char *reason = "character maps to <undefined>";
8676 PyObject *errorHandler = NULL;
8677 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008678 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008679 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 PyErr_BadArgument();
8683 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008686 if (PyUnicode_READY(input) == -1)
8687 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008688 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008689 kind = PyUnicode_KIND(input);
8690 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008691
8692 if (size == 0) {
8693 Py_INCREF(input);
8694 return input;
8695 }
8696
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008697 /* allocate enough for a simple 1:1 translation without
8698 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008699 _PyUnicodeWriter_Init(&writer);
8700 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008701 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702
Victor Stinner872b2912014-04-05 14:27:07 +02008703 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8704
8705 res = unicode_fast_translate(input, mapping, &writer, ignore);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008706 if (res < 0) {
8707 _PyUnicodeWriter_Dealloc(&writer);
8708 return NULL;
8709 }
8710 if (res == 1)
8711 return _PyUnicodeWriter_Finish(&writer);
8712
Victor Stinner89a76ab2014-04-05 11:44:04 +02008713 i = writer.pos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008714 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008716 int translate;
8717 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8718 Py_ssize_t newpos;
8719 /* startpos for collecting untranslatable chars */
8720 Py_ssize_t collstart;
8721 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008722 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008723
Victor Stinner1194ea02014-04-04 19:37:40 +02008724 ch = PyUnicode_READ(kind, data, i);
8725 translate = charmaptranslate_output(ch, mapping, &writer);
8726 if (translate < 0)
8727 goto onError;
8728
8729 if (translate != 0) {
8730 /* it worked => adjust input pointer */
8731 ++i;
8732 continue;
8733 }
8734
8735 /* untranslatable character */
8736 collstart = i;
8737 collend = i+1;
8738
8739 /* find all untranslatable characters */
8740 while (collend < size) {
8741 PyObject *x;
8742 ch = PyUnicode_READ(kind, data, collend);
8743 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008744 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008745 Py_XDECREF(x);
8746 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008747 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008748 ++collend;
8749 }
8750
8751 if (ignore) {
8752 i = collend;
8753 }
8754 else {
8755 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8756 reason, input, &exc,
8757 collstart, collend, &newpos);
8758 if (repunicode == NULL)
8759 goto onError;
8760 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008762 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008763 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008764 Py_DECREF(repunicode);
8765 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008766 }
8767 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008768 Py_XDECREF(exc);
8769 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008770 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008773 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008774 Py_XDECREF(exc);
8775 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008776 return NULL;
8777}
8778
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008779/* Deprecated. Use PyUnicode_Translate instead. */
8780PyObject *
8781PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8782 Py_ssize_t size,
8783 PyObject *mapping,
8784 const char *errors)
8785{
Christian Heimes5f520f42012-09-11 14:03:25 +02008786 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8788 if (!unicode)
8789 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008790 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8791 Py_DECREF(unicode);
8792 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008793}
8794
Alexander Belopolsky40018472011-02-26 01:02:56 +00008795PyObject *
8796PyUnicode_Translate(PyObject *str,
8797 PyObject *mapping,
8798 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008799{
8800 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008801
Guido van Rossumd57fd912000-03-10 22:53:23 +00008802 str = PyUnicode_FromObject(str);
8803 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008804 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008805 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008806 Py_DECREF(str);
8807 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008808}
Tim Petersced69f82003-09-16 20:30:58 +00008809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008810static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008811fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812{
8813 /* No need to call PyUnicode_READY(self) because this function is only
8814 called as a callback from fixup() which does it already. */
8815 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8816 const int kind = PyUnicode_KIND(self);
8817 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008818 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008819 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008820 Py_ssize_t i;
8821
8822 for (i = 0; i < len; ++i) {
8823 ch = PyUnicode_READ(kind, data, i);
8824 fixed = 0;
8825 if (ch > 127) {
8826 if (Py_UNICODE_ISSPACE(ch))
8827 fixed = ' ';
8828 else {
8829 const int decimal = Py_UNICODE_TODECIMAL(ch);
8830 if (decimal >= 0)
8831 fixed = '0' + decimal;
8832 }
8833 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008834 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008835 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008836 PyUnicode_WRITE(kind, data, i, fixed);
8837 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008838 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008839 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008840 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008841 }
8842
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008843 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008844}
8845
8846PyObject *
8847_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8848{
8849 if (!PyUnicode_Check(unicode)) {
8850 PyErr_BadInternalCall();
8851 return NULL;
8852 }
8853 if (PyUnicode_READY(unicode) == -1)
8854 return NULL;
8855 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8856 /* If the string is already ASCII, just return the same string */
8857 Py_INCREF(unicode);
8858 return unicode;
8859 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008860 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008861}
8862
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008863PyObject *
8864PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8865 Py_ssize_t length)
8866{
Victor Stinnerf0124502011-11-21 23:12:56 +01008867 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008868 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008869 Py_UCS4 maxchar;
8870 enum PyUnicode_Kind kind;
8871 void *data;
8872
Victor Stinner99d7ad02012-02-22 13:37:39 +01008873 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008874 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008875 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008876 if (ch > 127) {
8877 int decimal = Py_UNICODE_TODECIMAL(ch);
8878 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008879 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008880 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008881 }
8882 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008883
8884 /* Copy to a new string */
8885 decimal = PyUnicode_New(length, maxchar);
8886 if (decimal == NULL)
8887 return decimal;
8888 kind = PyUnicode_KIND(decimal);
8889 data = PyUnicode_DATA(decimal);
8890 /* Iterate over code points */
8891 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008892 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01008893 if (ch > 127) {
8894 int decimal = Py_UNICODE_TODECIMAL(ch);
8895 if (decimal >= 0)
8896 ch = '0' + decimal;
8897 }
8898 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008899 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008900 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008901}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008902/* --- Decimal Encoder ---------------------------------------------------- */
8903
Alexander Belopolsky40018472011-02-26 01:02:56 +00008904int
8905PyUnicode_EncodeDecimal(Py_UNICODE *s,
8906 Py_ssize_t length,
8907 char *output,
8908 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008909{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008910 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008911 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008912 enum PyUnicode_Kind kind;
8913 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008914
8915 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 PyErr_BadArgument();
8917 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008918 }
8919
Victor Stinner42bf7752011-11-21 22:52:58 +01008920 unicode = PyUnicode_FromUnicode(s, length);
8921 if (unicode == NULL)
8922 return -1;
8923
Benjamin Petersonbac79492012-01-14 13:34:47 -05008924 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008925 Py_DECREF(unicode);
8926 return -1;
8927 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008928 kind = PyUnicode_KIND(unicode);
8929 data = PyUnicode_DATA(unicode);
8930
Victor Stinnerb84d7232011-11-22 01:50:07 +01008931 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008932 PyObject *exc;
8933 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008935 Py_ssize_t startpos;
8936
8937 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008938
Benjamin Peterson29060642009-01-31 22:14:21 +00008939 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008940 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008941 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008943 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 decimal = Py_UNICODE_TODECIMAL(ch);
8945 if (decimal >= 0) {
8946 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008947 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 continue;
8949 }
8950 if (0 < ch && ch < 256) {
8951 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008952 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008953 continue;
8954 }
Victor Stinner6345be92011-11-25 20:09:01 +01008955
Victor Stinner42bf7752011-11-21 22:52:58 +01008956 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008957 exc = NULL;
8958 raise_encode_exception(&exc, "decimal", unicode,
8959 startpos, startpos+1,
8960 "invalid decimal Unicode string");
8961 Py_XDECREF(exc);
8962 Py_DECREF(unicode);
8963 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008964 }
8965 /* 0-terminate the output string */
8966 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008967 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008968 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008969}
8970
Guido van Rossumd57fd912000-03-10 22:53:23 +00008971/* --- Helpers ------------------------------------------------------------ */
8972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008973static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008974any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975 Py_ssize_t start,
8976 Py_ssize_t end)
8977{
8978 int kind1, kind2, kind;
8979 void *buf1, *buf2;
8980 Py_ssize_t len1, len2, result;
8981
8982 kind1 = PyUnicode_KIND(s1);
8983 kind2 = PyUnicode_KIND(s2);
8984 kind = kind1 > kind2 ? kind1 : kind2;
8985 buf1 = PyUnicode_DATA(s1);
8986 buf2 = PyUnicode_DATA(s2);
8987 if (kind1 != kind)
8988 buf1 = _PyUnicode_AsKind(s1, kind);
8989 if (!buf1)
8990 return -2;
8991 if (kind2 != kind)
8992 buf2 = _PyUnicode_AsKind(s2, kind);
8993 if (!buf2) {
8994 if (kind1 != kind) PyMem_Free(buf1);
8995 return -2;
8996 }
8997 len1 = PyUnicode_GET_LENGTH(s1);
8998 len2 = PyUnicode_GET_LENGTH(s2);
8999
Victor Stinner794d5672011-10-10 03:21:36 +02009000 if (direction > 0) {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009001 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009002 case PyUnicode_1BYTE_KIND:
9003 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9004 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9005 else
9006 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9007 break;
9008 case PyUnicode_2BYTE_KIND:
9009 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9010 break;
9011 case PyUnicode_4BYTE_KIND:
9012 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9013 break;
9014 default:
9015 assert(0); result = -2;
9016 }
9017 }
9018 else {
Benjamin Petersonead6b532011-12-20 17:23:42 -06009019 switch (kind) {
Victor Stinner794d5672011-10-10 03:21:36 +02009020 case PyUnicode_1BYTE_KIND:
9021 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9022 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9023 else
9024 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9025 break;
9026 case PyUnicode_2BYTE_KIND:
9027 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9028 break;
9029 case PyUnicode_4BYTE_KIND:
9030 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9031 break;
9032 default:
9033 assert(0); result = -2;
9034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035 }
9036
9037 if (kind1 != kind)
9038 PyMem_Free(buf1);
9039 if (kind2 != kind)
9040 PyMem_Free(buf2);
9041
9042 return result;
9043}
9044
9045Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009046_PyUnicode_InsertThousandsGrouping(
9047 PyObject *unicode, Py_ssize_t index,
9048 Py_ssize_t n_buffer,
9049 void *digits, Py_ssize_t n_digits,
9050 Py_ssize_t min_width,
9051 const char *grouping, PyObject *thousands_sep,
9052 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009053{
Victor Stinner41a863c2012-02-24 00:37:51 +01009054 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009055 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009056 Py_ssize_t thousands_sep_len;
9057 Py_ssize_t len;
9058
9059 if (unicode != NULL) {
9060 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009061 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009062 }
9063 else {
9064 kind = PyUnicode_1BYTE_KIND;
9065 data = NULL;
9066 }
9067 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9068 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9069 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9070 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009071 if (thousands_sep_kind < kind) {
9072 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9073 if (!thousands_sep_data)
9074 return -1;
9075 }
9076 else {
9077 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9078 if (!data)
9079 return -1;
9080 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009081 }
9082
Benjamin Petersonead6b532011-12-20 17:23:42 -06009083 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009085 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009086 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009087 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009088 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009089 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009090 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009091 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009092 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009093 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009094 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009095 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009097 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009098 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009099 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009100 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009101 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009103 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009104 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009105 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009106 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009107 break;
9108 default:
9109 assert(0);
9110 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009111 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009112 if (unicode != NULL && thousands_sep_kind != kind) {
9113 if (thousands_sep_kind < kind)
9114 PyMem_Free(thousands_sep_data);
9115 else
9116 PyMem_Free(data);
9117 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009118 if (unicode == NULL) {
9119 *maxchar = 127;
9120 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009121 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009122 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009123 }
9124 }
9125 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126}
9127
9128
Thomas Wouters477c8d52006-05-27 19:21:47 +00009129/* Helper macro to fix up start/end slice values in place.

   - end greater than len is clamped to len;
   - a negative end has len added to it and is then floored at 0;
   - a negative start has len added to it and is then floored at 0.
   A start greater than len is left unchanged, so callers still have to
   cope with start > end afterwards.

   start and end must be modifiable lvalues; every argument is evaluated
   more than once.  NOTE(review): the expansion is a bare sequence of
   `if` statements (no do { } while (0) wrapper), so it must not be used
   as the sole body of an unbraced if/else. */
Antoine Pitrouf2c54842010-01-13 08:07:53 +00009130#define ADJUST_INDICES(start, end, len) \
9131 if (end > len) \
9132 end = len; \
9133 else if (end < 0) { \
9134 end += len; \
9135 if (end < 0) \
9136 end = 0; \
9137 } \
9138 if (start < 0) { \
9139 start += len; \
9140 if (start < 0) \
9141 start = 0; \
9142 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009143
Alexander Belopolsky40018472011-02-26 01:02:56 +00009144Py_ssize_t
9145PyUnicode_Count(PyObject *str,
9146 PyObject *substr,
9147 Py_ssize_t start,
9148 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009149{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009150 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009151 PyObject* str_obj;
9152 PyObject* sub_obj;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 int kind1, kind2, kind;
9154 void *buf1 = NULL, *buf2 = NULL;
9155 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009156
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009157 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009158 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009160 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009161 if (!sub_obj) {
9162 Py_DECREF(str_obj);
9163 return -1;
9164 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009165 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009166 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009167 Py_DECREF(str_obj);
9168 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009169 }
Tim Petersced69f82003-09-16 20:30:58 +00009170
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171 kind1 = PyUnicode_KIND(str_obj);
9172 kind2 = PyUnicode_KIND(sub_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009173 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009174 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 buf2 = PyUnicode_DATA(sub_obj);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009176 if (kind2 != kind) {
Antoine Pitrou758153b2012-05-12 15:51:51 +02009177 if (kind2 > kind) {
9178 Py_DECREF(sub_obj);
9179 Py_DECREF(str_obj);
Antoine Pitroue45c0c52012-05-12 15:49:07 +02009180 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +02009181 }
Victor Stinner7931d9a2011-11-04 00:22:48 +01009182 buf2 = _PyUnicode_AsKind(sub_obj, kind);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009183 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009184 if (!buf2)
9185 goto onError;
9186 len1 = PyUnicode_GET_LENGTH(str_obj);
9187 len2 = PyUnicode_GET_LENGTH(sub_obj);
9188
9189 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -06009190 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009191 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009192 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9193 result = asciilib_count(
9194 ((Py_UCS1*)buf1) + start, end - start,
9195 buf2, len2, PY_SSIZE_T_MAX
9196 );
9197 else
9198 result = ucs1lib_count(
9199 ((Py_UCS1*)buf1) + start, end - start,
9200 buf2, len2, PY_SSIZE_T_MAX
9201 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009202 break;
9203 case PyUnicode_2BYTE_KIND:
9204 result = ucs2lib_count(
9205 ((Py_UCS2*)buf1) + start, end - start,
9206 buf2, len2, PY_SSIZE_T_MAX
9207 );
9208 break;
9209 case PyUnicode_4BYTE_KIND:
9210 result = ucs4lib_count(
9211 ((Py_UCS4*)buf1) + start, end - start,
9212 buf2, len2, PY_SSIZE_T_MAX
9213 );
9214 break;
9215 default:
9216 assert(0); result = 0;
9217 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009218
9219 Py_DECREF(sub_obj);
9220 Py_DECREF(str_obj);
9221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009222 if (kind2 != kind)
9223 PyMem_Free(buf2);
9224
Guido van Rossumd57fd912000-03-10 22:53:23 +00009225 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009226 onError:
9227 Py_DECREF(sub_obj);
9228 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009229 if (kind2 != kind && buf2)
9230 PyMem_Free(buf2);
9231 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232}
9233
Alexander Belopolsky40018472011-02-26 01:02:56 +00009234Py_ssize_t
9235PyUnicode_Find(PyObject *str,
9236 PyObject *sub,
9237 Py_ssize_t start,
9238 Py_ssize_t end,
9239 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009240{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009241 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009242
Guido van Rossumd57fd912000-03-10 22:53:23 +00009243 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009244 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009245 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009246 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009247 if (!sub) {
9248 Py_DECREF(str);
9249 return -2;
9250 }
9251 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9252 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009253 Py_DECREF(str);
9254 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255 }
Tim Petersced69f82003-09-16 20:30:58 +00009256
Victor Stinner794d5672011-10-10 03:21:36 +02009257 result = any_find_slice(direction,
9258 str, sub, start, end
9259 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009260
Guido van Rossumd57fd912000-03-10 22:53:23 +00009261 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009262 Py_DECREF(sub);
9263
Guido van Rossumd57fd912000-03-10 22:53:23 +00009264 return result;
9265}
9266
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009267Py_ssize_t
9268PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9269 Py_ssize_t start, Py_ssize_t end,
9270 int direction)
9271{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009273 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 if (PyUnicode_READY(str) == -1)
9275 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009276 if (start < 0 || end < 0) {
9277 PyErr_SetString(PyExc_IndexError, "string index out of range");
9278 return -2;
9279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 if (end > PyUnicode_GET_LENGTH(str))
9281 end = PyUnicode_GET_LENGTH(str);
9282 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009283 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9284 kind, end-start, ch, direction);
9285 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009286 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009287 else
9288 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289}
9290
Alexander Belopolsky40018472011-02-26 01:02:56 +00009291static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009292tailmatch(PyObject *self,
9293 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009294 Py_ssize_t start,
9295 Py_ssize_t end,
9296 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009297{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 int kind_self;
9299 int kind_sub;
9300 void *data_self;
9301 void *data_sub;
9302 Py_ssize_t offset;
9303 Py_ssize_t i;
9304 Py_ssize_t end_sub;
9305
9306 if (PyUnicode_READY(self) == -1 ||
9307 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009308 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009309
9310 if (PyUnicode_GET_LENGTH(substring) == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009311 return 1;
9312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9314 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009315 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009316 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009318 kind_self = PyUnicode_KIND(self);
9319 data_self = PyUnicode_DATA(self);
9320 kind_sub = PyUnicode_KIND(substring);
9321 data_sub = PyUnicode_DATA(substring);
9322 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9323
9324 if (direction > 0)
9325 offset = end;
9326 else
9327 offset = start;
9328
9329 if (PyUnicode_READ(kind_self, data_self, offset) ==
9330 PyUnicode_READ(kind_sub, data_sub, 0) &&
9331 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9332 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9333 /* If both are of the same kind, memcmp is sufficient */
9334 if (kind_self == kind_sub) {
9335 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009336 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009337 data_sub,
9338 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009339 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009340 }
9341 /* otherwise we have to compare each character by first accesing it */
9342 else {
9343 /* We do not need to compare 0 and len(substring)-1 because
9344 the if statement above ensured already that they are equal
9345 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 for (i = 1; i < end_sub; ++i) {
9347 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9348 PyUnicode_READ(kind_sub, data_sub, i))
9349 return 0;
9350 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353 }
9354
9355 return 0;
9356}
9357
Alexander Belopolsky40018472011-02-26 01:02:56 +00009358Py_ssize_t
9359PyUnicode_Tailmatch(PyObject *str,
9360 PyObject *substr,
9361 Py_ssize_t start,
9362 Py_ssize_t end,
9363 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009365 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009366
Guido van Rossumd57fd912000-03-10 22:53:23 +00009367 str = PyUnicode_FromObject(str);
9368 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009369 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370 substr = PyUnicode_FromObject(substr);
9371 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009372 Py_DECREF(str);
9373 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374 }
Tim Petersced69f82003-09-16 20:30:58 +00009375
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009376 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009377 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009378 Py_DECREF(str);
9379 Py_DECREF(substr);
9380 return result;
9381}
9382
Guido van Rossumd57fd912000-03-10 22:53:23 +00009383/* Apply fixfct filter to the Unicode object self and return a
9384 reference to the modified object */
9385
Alexander Belopolsky40018472011-02-26 01:02:56 +00009386static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009387fixup(PyObject *self,
9388 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009389{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009390 PyObject *u;
9391 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009392 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009393
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009394 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009395 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009396 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009397 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009398
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009399 /* fix functions return the new maximum character in a string,
9400 if the kind of the resulting unicode object does not change,
9401 everything is fine. Otherwise we need to change the string kind
9402 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009403 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009404
9405 if (maxchar_new == 0) {
9406 /* no changes */;
9407 if (PyUnicode_CheckExact(self)) {
9408 Py_DECREF(u);
9409 Py_INCREF(self);
9410 return self;
9411 }
9412 else
9413 return u;
9414 }
9415
Victor Stinnere6abb482012-05-02 01:15:40 +02009416 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009417
Victor Stinnereaab6042011-12-11 22:22:39 +01009418 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009420
9421 /* In case the maximum character changed, we need to
9422 convert the string to the new category. */
9423 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9424 if (v == NULL) {
9425 Py_DECREF(u);
9426 return NULL;
9427 }
9428 if (maxchar_new > maxchar_old) {
9429 /* If the maxchar increased so that the kind changed, not all
9430 characters are representable anymore and we need to fix the
9431 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009432 _PyUnicode_FastCopyCharacters(v, 0,
9433 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009434 maxchar_old = fixfct(v);
9435 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009436 }
9437 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009438 _PyUnicode_FastCopyCharacters(v, 0,
9439 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009440 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009441 Py_DECREF(u);
9442 assert(_PyUnicode_CheckConsistency(v, 1));
9443 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009444}
9445
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009446static PyObject *
9447ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009448{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009449 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9450 char *resdata, *data = PyUnicode_DATA(self);
9451 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009452
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009453 res = PyUnicode_New(len, 127);
9454 if (res == NULL)
9455 return NULL;
9456 resdata = PyUnicode_DATA(res);
9457 if (lower)
9458 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009459 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009460 _Py_bytes_upper(resdata, data, len);
9461 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009462}
9463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009465handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009467 Py_ssize_t j;
9468 int final_sigma;
9469 Py_UCS4 c;
9470 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009471
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009472 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9473
9474 where ! is a negation and \p{xxx} is a character with property xxx.
9475 */
9476 for (j = i - 1; j >= 0; j--) {
9477 c = PyUnicode_READ(kind, data, j);
9478 if (!_PyUnicode_IsCaseIgnorable(c))
9479 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009481 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9482 if (final_sigma) {
9483 for (j = i + 1; j < length; j++) {
9484 c = PyUnicode_READ(kind, data, j);
9485 if (!_PyUnicode_IsCaseIgnorable(c))
9486 break;
9487 }
9488 final_sigma = j == length || !_PyUnicode_IsCased(c);
9489 }
9490 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009491}
9492
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009493static int
9494lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9495 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009497 /* Obscure special case. */
9498 if (c == 0x3A3) {
9499 mapped[0] = handle_capital_sigma(kind, data, length, i);
9500 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009502 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503}
9504
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009505static Py_ssize_t
9506do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009507{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009508 Py_ssize_t i, k = 0;
9509 int n_res, j;
9510 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009511
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009512 c = PyUnicode_READ(kind, data, 0);
9513 n_res = _PyUnicode_ToUpperFull(c, mapped);
9514 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009515 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009516 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009517 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009518 for (i = 1; i < length; i++) {
9519 c = PyUnicode_READ(kind, data, i);
9520 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9521 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009522 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009523 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009524 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009525 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009526 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009527}
9528
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009529static Py_ssize_t
9530do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9531 Py_ssize_t i, k = 0;
9532
9533 for (i = 0; i < length; i++) {
9534 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9535 int n_res, j;
9536 if (Py_UNICODE_ISUPPER(c)) {
9537 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9538 }
9539 else if (Py_UNICODE_ISLOWER(c)) {
9540 n_res = _PyUnicode_ToUpperFull(c, mapped);
9541 }
9542 else {
9543 n_res = 1;
9544 mapped[0] = c;
9545 }
9546 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009547 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009548 res[k++] = mapped[j];
9549 }
9550 }
9551 return k;
9552}
9553
9554static Py_ssize_t
9555do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9556 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009557{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009558 Py_ssize_t i, k = 0;
9559
9560 for (i = 0; i < length; i++) {
9561 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9562 int n_res, j;
9563 if (lower)
9564 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9565 else
9566 n_res = _PyUnicode_ToUpperFull(c, mapped);
9567 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009568 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009569 res[k++] = mapped[j];
9570 }
9571 }
9572 return k;
9573}
9574
9575static Py_ssize_t
9576do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9577{
9578 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9579}
9580
9581static Py_ssize_t
9582do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9583{
9584 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9585}
9586
Benjamin Petersone51757f2012-01-12 21:10:29 -05009587static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009588do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9589{
9590 Py_ssize_t i, k = 0;
9591
9592 for (i = 0; i < length; i++) {
9593 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9594 Py_UCS4 mapped[3];
9595 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9596 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009597 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009598 res[k++] = mapped[j];
9599 }
9600 }
9601 return k;
9602}
9603
9604static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009605do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9606{
9607 Py_ssize_t i, k = 0;
9608 int previous_is_cased;
9609
9610 previous_is_cased = 0;
9611 for (i = 0; i < length; i++) {
9612 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9613 Py_UCS4 mapped[3];
9614 int n_res, j;
9615
9616 if (previous_is_cased)
9617 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9618 else
9619 n_res = _PyUnicode_ToTitleFull(c, mapped);
9620
9621 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009622 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009623 res[k++] = mapped[j];
9624 }
9625
9626 previous_is_cased = _PyUnicode_IsCased(c);
9627 }
9628 return k;
9629}
9630
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009631static PyObject *
9632case_operation(PyObject *self,
9633 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9634{
9635 PyObject *res = NULL;
9636 Py_ssize_t length, newlength = 0;
9637 int kind, outkind;
9638 void *data, *outdata;
9639 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9640
Benjamin Petersoneea48462012-01-16 14:28:50 -05009641 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009642
9643 kind = PyUnicode_KIND(self);
9644 data = PyUnicode_DATA(self);
9645 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009646 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009647 PyErr_SetString(PyExc_OverflowError, "string is too long");
9648 return NULL;
9649 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009650 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009651 if (tmp == NULL)
9652 return PyErr_NoMemory();
9653 newlength = perform(kind, data, length, tmp, &maxchar);
9654 res = PyUnicode_New(newlength, maxchar);
9655 if (res == NULL)
9656 goto leave;
9657 tmpend = tmp + newlength;
9658 outdata = PyUnicode_DATA(res);
9659 outkind = PyUnicode_KIND(res);
9660 switch (outkind) {
9661 case PyUnicode_1BYTE_KIND:
9662 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9663 break;
9664 case PyUnicode_2BYTE_KIND:
9665 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9666 break;
9667 case PyUnicode_4BYTE_KIND:
9668 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9669 break;
9670 default:
9671 assert(0);
9672 break;
9673 }
9674 leave:
9675 PyMem_FREE(tmp);
9676 return res;
9677}
9678
Tim Peters8ce9f162004-08-27 01:49:32 +00009679PyObject *
9680PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009682 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009683 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009685 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009686 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9687 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009688 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009689 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009690 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009692 int use_memcpy;
9693 unsigned char *res_data = NULL, *sep_data = NULL;
9694 PyObject *last_obj;
9695 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009696
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009697 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009698 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009699 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009700 }
9701
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009702 /* NOTE: the following code can't call back into Python code,
9703 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009704 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009705
Tim Peters05eba1f2004-08-27 21:32:02 +00009706 seqlen = PySequence_Fast_GET_SIZE(fseq);
9707 /* If empty sequence, return u"". */
9708 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009709 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009710 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009711 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009712
Tim Peters05eba1f2004-08-27 21:32:02 +00009713 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009714 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009715 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009716 if (seqlen == 1) {
9717 if (PyUnicode_CheckExact(items[0])) {
9718 res = items[0];
9719 Py_INCREF(res);
9720 Py_DECREF(fseq);
9721 return res;
9722 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009723 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009724 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009725 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009726 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009727 /* Set up sep and seplen */
9728 if (separator == NULL) {
9729 /* fall back to a blank space separator */
9730 sep = PyUnicode_FromOrdinal(' ');
9731 if (!sep)
9732 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009733 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009734 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009735 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009736 else {
9737 if (!PyUnicode_Check(separator)) {
9738 PyErr_Format(PyExc_TypeError,
9739 "separator: expected str instance,"
9740 " %.80s found",
9741 Py_TYPE(separator)->tp_name);
9742 goto onError;
9743 }
9744 if (PyUnicode_READY(separator))
9745 goto onError;
9746 sep = separator;
9747 seplen = PyUnicode_GET_LENGTH(separator);
9748 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9749 /* inc refcount to keep this code path symmetric with the
9750 above case of a blank separator */
9751 Py_INCREF(sep);
9752 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009753 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009754 }
9755
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009756 /* There are at least two things to join, or else we have a subclass
9757 * of str in the sequence.
9758 * Do a pre-pass to figure out the total amount of space we'll
9759 * need (sz), and see whether all argument are strings.
9760 */
9761 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009762#ifdef Py_DEBUG
9763 use_memcpy = 0;
9764#else
9765 use_memcpy = 1;
9766#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009767 for (i = 0; i < seqlen; i++) {
9768 const Py_ssize_t old_sz = sz;
9769 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009770 if (!PyUnicode_Check(item)) {
9771 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009772 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009773 " %.80s found",
9774 i, Py_TYPE(item)->tp_name);
9775 goto onError;
9776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 if (PyUnicode_READY(item) == -1)
9778 goto onError;
9779 sz += PyUnicode_GET_LENGTH(item);
9780 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009781 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009782 if (i != 0)
9783 sz += seplen;
9784 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9785 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009786 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009787 goto onError;
9788 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009789 if (use_memcpy && last_obj != NULL) {
9790 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9791 use_memcpy = 0;
9792 }
9793 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009794 }
Tim Petersced69f82003-09-16 20:30:58 +00009795
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009796 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009797 if (res == NULL)
9798 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009799
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009800 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009801#ifdef Py_DEBUG
9802 use_memcpy = 0;
9803#else
9804 if (use_memcpy) {
9805 res_data = PyUnicode_1BYTE_DATA(res);
9806 kind = PyUnicode_KIND(res);
9807 if (seplen != 0)
9808 sep_data = PyUnicode_1BYTE_DATA(sep);
9809 }
9810#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009811 if (use_memcpy) {
9812 for (i = 0; i < seqlen; ++i) {
9813 Py_ssize_t itemlen;
9814 item = items[i];
9815
9816 /* Copy item, and maybe the separator. */
9817 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009818 Py_MEMCPY(res_data,
9819 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009820 kind * seplen);
9821 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009822 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009823
9824 itemlen = PyUnicode_GET_LENGTH(item);
9825 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009826 Py_MEMCPY(res_data,
9827 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009828 kind * itemlen);
9829 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009830 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009831 }
9832 assert(res_data == PyUnicode_1BYTE_DATA(res)
9833 + kind * PyUnicode_GET_LENGTH(res));
9834 }
9835 else {
9836 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9837 Py_ssize_t itemlen;
9838 item = items[i];
9839
9840 /* Copy item, and maybe the separator. */
9841 if (i && seplen != 0) {
9842 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9843 res_offset += seplen;
9844 }
9845
9846 itemlen = PyUnicode_GET_LENGTH(item);
9847 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009848 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009849 res_offset += itemlen;
9850 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009851 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009852 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009853 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009854
Tim Peters05eba1f2004-08-27 21:32:02 +00009855 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009856 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009857 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009858 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859
Benjamin Peterson29060642009-01-31 22:14:21 +00009860 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009861 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009862 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009863 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009864 return NULL;
9865}
9866
/* Store `value` into `length` consecutive code units of the character
   buffer `data`, beginning at code-unit index `start`.

   `kind` selects the code-unit width (PyUnicode_1BYTE_KIND,
   PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND); PyUnicode_WCHAR_KIND and
   any other value trip an assertion.  `data` is cast to the matching
   pointer type, so it must be writable.

   Every macro argument is parenthesized at each use so that compound
   expressions (e.g. `cond ? a : b`) keep their intended meaning.  Note that
   `value` and `length` are still evaluated more than once, so arguments
   must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9890
Victor Stinnerd3f08822012-05-29 12:57:52 +02009891void
9892_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9893 Py_UCS4 fill_char)
9894{
9895 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9896 const void *data = PyUnicode_DATA(unicode);
9897 assert(PyUnicode_IS_READY(unicode));
9898 assert(unicode_modifiable(unicode));
9899 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9900 assert(start >= 0);
9901 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9902 FILL(kind, data, fill_char, start, length);
9903}
9904
Victor Stinner3fe55312012-01-04 00:33:50 +01009905Py_ssize_t
9906PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9907 Py_UCS4 fill_char)
9908{
9909 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009910
9911 if (!PyUnicode_Check(unicode)) {
9912 PyErr_BadInternalCall();
9913 return -1;
9914 }
9915 if (PyUnicode_READY(unicode) == -1)
9916 return -1;
9917 if (unicode_check_modifiable(unicode))
9918 return -1;
9919
Victor Stinnerd3f08822012-05-29 12:57:52 +02009920 if (start < 0) {
9921 PyErr_SetString(PyExc_IndexError, "string index out of range");
9922 return -1;
9923 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009924 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9925 PyErr_SetString(PyExc_ValueError,
9926 "fill character is bigger than "
9927 "the string maximum character");
9928 return -1;
9929 }
9930
9931 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9932 length = Py_MIN(maxlen, length);
9933 if (length <= 0)
9934 return 0;
9935
Victor Stinnerd3f08822012-05-29 12:57:52 +02009936 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009937 return length;
9938}
9939
Victor Stinner9310abb2011-10-05 00:59:23 +02009940static PyObject *
9941pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009942 Py_ssize_t left,
9943 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009944 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009945{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 PyObject *u;
9947 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009948 int kind;
9949 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009950
9951 if (left < 0)
9952 left = 0;
9953 if (right < 0)
9954 right = 0;
9955
Victor Stinnerc4b49542011-12-11 22:44:26 +01009956 if (left == 0 && right == 0)
9957 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9960 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009961 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9962 return NULL;
9963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009965 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009966 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009967 if (!u)
9968 return NULL;
9969
9970 kind = PyUnicode_KIND(u);
9971 data = PyUnicode_DATA(u);
9972 if (left)
9973 FILL(kind, data, fill, 0, left);
9974 if (right)
9975 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009976 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009977 assert(_PyUnicode_CheckConsistency(u, 1));
9978 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009979}
9980
Alexander Belopolsky40018472011-02-26 01:02:56 +00009981PyObject *
9982PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009983{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009984 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009985
9986 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009987 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009988 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009989 if (PyUnicode_READY(string) == -1) {
9990 Py_DECREF(string);
9991 return NULL;
9992 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009993
Benjamin Petersonead6b532011-12-20 17:23:42 -06009994 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009995 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009996 if (PyUnicode_IS_ASCII(string))
9997 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009998 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009999 PyUnicode_GET_LENGTH(string), keepends);
10000 else
10001 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010002 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010003 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 break;
10005 case PyUnicode_2BYTE_KIND:
10006 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010007 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 PyUnicode_GET_LENGTH(string), keepends);
10009 break;
10010 case PyUnicode_4BYTE_KIND:
10011 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010012 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010013 PyUnicode_GET_LENGTH(string), keepends);
10014 break;
10015 default:
10016 assert(0);
10017 list = 0;
10018 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010019 Py_DECREF(string);
10020 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010021}
10022
Alexander Belopolsky40018472011-02-26 01:02:56 +000010023static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010024split(PyObject *self,
10025 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010026 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 int kind1, kind2, kind;
10029 void *buf1, *buf2;
10030 Py_ssize_t len1, len2;
10031 PyObject* out;
10032
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010034 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010036 if (PyUnicode_READY(self) == -1)
10037 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010038
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010040 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010042 if (PyUnicode_IS_ASCII(self))
10043 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010044 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010045 PyUnicode_GET_LENGTH(self), maxcount
10046 );
10047 else
10048 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010049 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010050 PyUnicode_GET_LENGTH(self), maxcount
10051 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 case PyUnicode_2BYTE_KIND:
10053 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010054 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010055 PyUnicode_GET_LENGTH(self), maxcount
10056 );
10057 case PyUnicode_4BYTE_KIND:
10058 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010059 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010060 PyUnicode_GET_LENGTH(self), maxcount
10061 );
10062 default:
10063 assert(0);
10064 return NULL;
10065 }
10066
10067 if (PyUnicode_READY(substring) == -1)
10068 return NULL;
10069
10070 kind1 = PyUnicode_KIND(self);
10071 kind2 = PyUnicode_KIND(substring);
10072 kind = kind1 > kind2 ? kind1 : kind2;
10073 buf1 = PyUnicode_DATA(self);
10074 buf2 = PyUnicode_DATA(substring);
10075 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010076 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 if (!buf1)
10078 return NULL;
10079 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010080 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 if (!buf2) {
10082 if (kind1 != kind) PyMem_Free(buf1);
10083 return NULL;
10084 }
10085 len1 = PyUnicode_GET_LENGTH(self);
10086 len2 = PyUnicode_GET_LENGTH(substring);
10087
Benjamin Petersonead6b532011-12-20 17:23:42 -060010088 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010090 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10091 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010092 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010093 else
10094 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010095 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 break;
10097 case PyUnicode_2BYTE_KIND:
10098 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010099 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 break;
10101 case PyUnicode_4BYTE_KIND:
10102 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010103 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010104 break;
10105 default:
10106 out = NULL;
10107 }
10108 if (kind1 != kind)
10109 PyMem_Free(buf1);
10110 if (kind2 != kind)
10111 PyMem_Free(buf2);
10112 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010113}
10114
Alexander Belopolsky40018472011-02-26 01:02:56 +000010115static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010116rsplit(PyObject *self,
10117 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010118 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010119{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010120 int kind1, kind2, kind;
10121 void *buf1, *buf2;
10122 Py_ssize_t len1, len2;
10123 PyObject* out;
10124
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010125 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010126 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 if (PyUnicode_READY(self) == -1)
10129 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010132 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010134 if (PyUnicode_IS_ASCII(self))
10135 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010136 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010137 PyUnicode_GET_LENGTH(self), maxcount
10138 );
10139 else
10140 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010141 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010142 PyUnicode_GET_LENGTH(self), maxcount
10143 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010144 case PyUnicode_2BYTE_KIND:
10145 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010146 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010147 PyUnicode_GET_LENGTH(self), maxcount
10148 );
10149 case PyUnicode_4BYTE_KIND:
10150 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010151 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010152 PyUnicode_GET_LENGTH(self), maxcount
10153 );
10154 default:
10155 assert(0);
10156 return NULL;
10157 }
10158
10159 if (PyUnicode_READY(substring) == -1)
10160 return NULL;
10161
10162 kind1 = PyUnicode_KIND(self);
10163 kind2 = PyUnicode_KIND(substring);
10164 kind = kind1 > kind2 ? kind1 : kind2;
10165 buf1 = PyUnicode_DATA(self);
10166 buf2 = PyUnicode_DATA(substring);
10167 if (kind1 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010168 buf1 = _PyUnicode_AsKind(self, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 if (!buf1)
10170 return NULL;
10171 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010010172 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 if (!buf2) {
10174 if (kind1 != kind) PyMem_Free(buf1);
10175 return NULL;
10176 }
10177 len1 = PyUnicode_GET_LENGTH(self);
10178 len2 = PyUnicode_GET_LENGTH(substring);
10179
Benjamin Petersonead6b532011-12-20 17:23:42 -060010180 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010182 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10183 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010184 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010185 else
10186 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010187 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 break;
10189 case PyUnicode_2BYTE_KIND:
10190 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010191 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 break;
10193 case PyUnicode_4BYTE_KIND:
10194 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010195 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 break;
10197 default:
10198 out = NULL;
10199 }
10200 if (kind1 != kind)
10201 PyMem_Free(buf1);
10202 if (kind2 != kind)
10203 PyMem_Free(buf2);
10204 return out;
10205}
10206
10207static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010208anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10209 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010211 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010213 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10214 return asciilib_find(buf1, len1, buf2, len2, offset);
10215 else
10216 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 case PyUnicode_2BYTE_KIND:
10218 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10219 case PyUnicode_4BYTE_KIND:
10220 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10221 }
10222 assert(0);
10223 return -1;
10224}
10225
10226static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010227anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10228 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010230 switch (kind) {
10231 case PyUnicode_1BYTE_KIND:
10232 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10233 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10234 else
10235 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10236 case PyUnicode_2BYTE_KIND:
10237 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10238 case PyUnicode_4BYTE_KIND:
10239 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10240 }
10241 assert(0);
10242 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010243}
10244
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010245static void
10246replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10247 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10248{
10249 int kind = PyUnicode_KIND(u);
10250 void *data = PyUnicode_DATA(u);
10251 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10252 if (kind == PyUnicode_1BYTE_KIND) {
10253 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10254 (Py_UCS1 *)data + len,
10255 u1, u2, maxcount);
10256 }
10257 else if (kind == PyUnicode_2BYTE_KIND) {
10258 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10259 (Py_UCS2 *)data + len,
10260 u1, u2, maxcount);
10261 }
10262 else {
10263 assert(kind == PyUnicode_4BYTE_KIND);
10264 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10265 (Py_UCS4 *)data + len,
10266 u1, u2, maxcount);
10267 }
10268}
10269
Alexander Belopolsky40018472011-02-26 01:02:56 +000010270static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010271replace(PyObject *self, PyObject *str1,
10272 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010273{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274 PyObject *u;
10275 char *sbuf = PyUnicode_DATA(self);
10276 char *buf1 = PyUnicode_DATA(str1);
10277 char *buf2 = PyUnicode_DATA(str2);
10278 int srelease = 0, release1 = 0, release2 = 0;
10279 int skind = PyUnicode_KIND(self);
10280 int kind1 = PyUnicode_KIND(str1);
10281 int kind2 = PyUnicode_KIND(str2);
10282 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10283 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10284 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010285 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010286 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287
10288 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010289 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010291 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010292
Victor Stinner59de0ee2011-10-07 10:01:28 +020010293 if (str1 == str2)
10294 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010295
Victor Stinner49a0a212011-10-12 23:46:10 +020010296 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010297 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10298 if (maxchar < maxchar_str1)
10299 /* substring too wide to be present */
10300 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010301 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10302 /* Replacing str1 with str2 may cause a maxchar reduction in the
10303 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010304 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010305 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010308 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010310 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010312 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010313 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010314 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010315
Victor Stinner69ed0f42013-04-09 21:48:24 +020010316 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010317 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010318 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010319 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010320 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010322 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010324
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010325 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10326 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010327 }
10328 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 int rkind = skind;
10330 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010331 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 if (kind1 < rkind) {
10334 /* widen substring */
10335 buf1 = _PyUnicode_AsKind(str1, rkind);
10336 if (!buf1) goto error;
10337 release1 = 1;
10338 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010339 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010340 if (i < 0)
10341 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 if (rkind > kind2) {
10343 /* widen replacement */
10344 buf2 = _PyUnicode_AsKind(str2, rkind);
10345 if (!buf2) goto error;
10346 release2 = 1;
10347 }
10348 else if (rkind < kind2) {
10349 /* widen self and buf1 */
10350 rkind = kind2;
10351 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010352 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 sbuf = _PyUnicode_AsKind(self, rkind);
10354 if (!sbuf) goto error;
10355 srelease = 1;
10356 buf1 = _PyUnicode_AsKind(str1, rkind);
10357 if (!buf1) goto error;
10358 release1 = 1;
10359 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010360 u = PyUnicode_New(slen, maxchar);
10361 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010363 assert(PyUnicode_KIND(u) == rkind);
10364 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010365
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010366 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010367 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010368 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010370 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010372
10373 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010374 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010375 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010376 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010377 if (i == -1)
10378 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010379 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010381 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010383 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010384 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010385 }
10386 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010388 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 int rkind = skind;
10390 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010391
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010393 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 buf1 = _PyUnicode_AsKind(str1, rkind);
10395 if (!buf1) goto error;
10396 release1 = 1;
10397 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010398 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010399 if (n == 0)
10400 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010402 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 buf2 = _PyUnicode_AsKind(str2, rkind);
10404 if (!buf2) goto error;
10405 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010406 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010408 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 rkind = kind2;
10410 sbuf = _PyUnicode_AsKind(self, rkind);
10411 if (!sbuf) goto error;
10412 srelease = 1;
10413 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010414 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 buf1 = _PyUnicode_AsKind(str1, rkind);
10416 if (!buf1) goto error;
10417 release1 = 1;
10418 }
10419 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10420 PyUnicode_GET_LENGTH(str1))); */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010421 if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422 PyErr_SetString(PyExc_OverflowError,
10423 "replace string is too long");
10424 goto error;
10425 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010426 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010427 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010428 _Py_INCREF_UNICODE_EMPTY();
10429 if (!unicode_empty)
10430 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010431 u = unicode_empty;
10432 goto done;
10433 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010434 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 PyErr_SetString(PyExc_OverflowError,
10436 "replace string is too long");
10437 goto error;
10438 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010439 u = PyUnicode_New(new_size, maxchar);
10440 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010442 assert(PyUnicode_KIND(u) == rkind);
10443 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 ires = i = 0;
10445 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010446 while (n-- > 0) {
10447 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010448 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010449 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010450 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010451 if (j == -1)
10452 break;
10453 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010454 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010455 memcpy(res + rkind * ires,
10456 sbuf + rkind * i,
10457 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010459 }
10460 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010462 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010463 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010464 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010465 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010467 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010468 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010469 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010470 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010471 memcpy(res + rkind * ires,
10472 sbuf + rkind * i,
10473 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010474 }
10475 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010476 /* interleave */
10477 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010478 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010479 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010480 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010481 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010482 if (--n <= 0)
10483 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010484 memcpy(res + rkind * ires,
10485 sbuf + rkind * i,
10486 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 ires++;
10488 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010489 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010490 memcpy(res + rkind * ires,
10491 sbuf + rkind * i,
10492 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010493 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010494 }
10495
10496 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010497 unicode_adjust_maxchar(&u);
10498 if (u == NULL)
10499 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010501
10502 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 if (srelease)
10504 PyMem_FREE(sbuf);
10505 if (release1)
10506 PyMem_FREE(buf1);
10507 if (release2)
10508 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010509 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010511
Benjamin Peterson29060642009-01-31 22:14:21 +000010512 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010513 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 if (srelease)
10515 PyMem_FREE(sbuf);
10516 if (release1)
10517 PyMem_FREE(buf1);
10518 if (release2)
10519 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010520 return unicode_result_unchanged(self);
10521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 error:
10523 if (srelease && sbuf)
10524 PyMem_FREE(sbuf);
10525 if (release1 && buf1)
10526 PyMem_FREE(buf1);
10527 if (release2 && buf2)
10528 PyMem_FREE(buf2);
10529 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010530}
10531
10532/* --- Unicode Object Methods --------------------------------------------- */
10533
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010534PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010535 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010536\n\
10537Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010538characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010539
10540static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010541unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010542{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010543 if (PyUnicode_READY(self) == -1)
10544 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010545 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010546}
10547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010548PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010549 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010550\n\
10551Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010552have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010553
10554static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010555unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010556{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010557 if (PyUnicode_READY(self) == -1)
10558 return NULL;
10559 if (PyUnicode_GET_LENGTH(self) == 0)
10560 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010561 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562}
10563
Benjamin Petersond5890c82012-01-14 13:23:30 -050010564PyDoc_STRVAR(casefold__doc__,
10565 "S.casefold() -> str\n\
10566\n\
10567Return a version of S suitable for caseless comparisons.");
10568
10569static PyObject *
10570unicode_casefold(PyObject *self)
10571{
10572 if (PyUnicode_READY(self) == -1)
10573 return NULL;
10574 if (PyUnicode_IS_ASCII(self))
10575 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010576 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010577}
10578
10579
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010580/* Argument converter. Coerces to a single unicode character */
10581
10582static int
10583convert_uc(PyObject *obj, void *addr)
10584{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010586 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010587
Benjamin Peterson14339b62009-01-31 16:36:08 +000010588 uniobj = PyUnicode_FromObject(obj);
10589 if (uniobj == NULL) {
10590 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010591 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010592 return 0;
10593 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010595 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010596 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010597 Py_DECREF(uniobj);
10598 return 0;
10599 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010601 Py_DECREF(uniobj);
10602 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010603}
10604
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010605PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010606 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010608Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010609done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610
10611static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010612unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010613{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010614 Py_ssize_t marg, left;
10615 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 Py_UCS4 fillchar = ' ';
10617
Victor Stinnere9a29352011-10-01 02:14:59 +020010618 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010620
Benjamin Petersonbac79492012-01-14 13:34:47 -050010621 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622 return NULL;
10623
Victor Stinnerc4b49542011-12-11 22:44:26 +010010624 if (PyUnicode_GET_LENGTH(self) >= width)
10625 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010626
Victor Stinnerc4b49542011-12-11 22:44:26 +010010627 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010628 left = marg / 2 + (marg & width & 1);
10629
Victor Stinner9310abb2011-10-05 00:59:23 +020010630 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010631}
10632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633/* This function assumes that str1 and str2 are readied by the caller. */
10634
Marc-André Lemburge5034372000-08-08 08:04:29 +000010635static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010636unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010637{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010638#define COMPARE(TYPE1, TYPE2) \
10639 do { \
10640 TYPE1* p1 = (TYPE1 *)data1; \
10641 TYPE2* p2 = (TYPE2 *)data2; \
10642 TYPE1* end = p1 + len; \
10643 Py_UCS4 c1, c2; \
10644 for (; p1 != end; p1++, p2++) { \
10645 c1 = *p1; \
10646 c2 = *p2; \
10647 if (c1 != c2) \
10648 return (c1 < c2) ? -1 : 1; \
10649 } \
10650 } \
10651 while (0)
10652
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 int kind1, kind2;
10654 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010655 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010656
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 kind1 = PyUnicode_KIND(str1);
10658 kind2 = PyUnicode_KIND(str2);
10659 data1 = PyUnicode_DATA(str1);
10660 data2 = PyUnicode_DATA(str2);
10661 len1 = PyUnicode_GET_LENGTH(str1);
10662 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010663 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010664
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010665 switch(kind1) {
10666 case PyUnicode_1BYTE_KIND:
10667 {
10668 switch(kind2) {
10669 case PyUnicode_1BYTE_KIND:
10670 {
10671 int cmp = memcmp(data1, data2, len);
10672 /* normalize result of memcmp() into the range [-1; 1] */
10673 if (cmp < 0)
10674 return -1;
10675 if (cmp > 0)
10676 return 1;
10677 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010678 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010679 case PyUnicode_2BYTE_KIND:
10680 COMPARE(Py_UCS1, Py_UCS2);
10681 break;
10682 case PyUnicode_4BYTE_KIND:
10683 COMPARE(Py_UCS1, Py_UCS4);
10684 break;
10685 default:
10686 assert(0);
10687 }
10688 break;
10689 }
10690 case PyUnicode_2BYTE_KIND:
10691 {
10692 switch(kind2) {
10693 case PyUnicode_1BYTE_KIND:
10694 COMPARE(Py_UCS2, Py_UCS1);
10695 break;
10696 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010697 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010698 COMPARE(Py_UCS2, Py_UCS2);
10699 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010700 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010701 case PyUnicode_4BYTE_KIND:
10702 COMPARE(Py_UCS2, Py_UCS4);
10703 break;
10704 default:
10705 assert(0);
10706 }
10707 break;
10708 }
10709 case PyUnicode_4BYTE_KIND:
10710 {
10711 switch(kind2) {
10712 case PyUnicode_1BYTE_KIND:
10713 COMPARE(Py_UCS4, Py_UCS1);
10714 break;
10715 case PyUnicode_2BYTE_KIND:
10716 COMPARE(Py_UCS4, Py_UCS2);
10717 break;
10718 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010719 {
10720#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10721 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10722 /* normalize result of wmemcmp() into the range [-1; 1] */
10723 if (cmp < 0)
10724 return -1;
10725 if (cmp > 0)
10726 return 1;
10727#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010728 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010729#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010730 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010731 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010732 default:
10733 assert(0);
10734 }
10735 break;
10736 }
10737 default:
10738 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010739 }
10740
Victor Stinner770e19e2012-10-04 22:59:45 +020010741 if (len1 == len2)
10742 return 0;
10743 if (len1 < len2)
10744 return -1;
10745 else
10746 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010747
10748#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010749}
10750
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010751Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010752unicode_compare_eq(PyObject *str1, PyObject *str2)
10753{
10754 int kind;
10755 void *data1, *data2;
10756 Py_ssize_t len;
10757 int cmp;
10758
Victor Stinnere5567ad2012-10-23 02:48:49 +020010759 len = PyUnicode_GET_LENGTH(str1);
10760 if (PyUnicode_GET_LENGTH(str2) != len)
10761 return 0;
10762 kind = PyUnicode_KIND(str1);
10763 if (PyUnicode_KIND(str2) != kind)
10764 return 0;
10765 data1 = PyUnicode_DATA(str1);
10766 data2 = PyUnicode_DATA(str2);
10767
10768 cmp = memcmp(data1, data2, len * kind);
10769 return (cmp == 0);
10770}
10771
10772
Alexander Belopolsky40018472011-02-26 01:02:56 +000010773int
10774PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10777 if (PyUnicode_READY(left) == -1 ||
10778 PyUnicode_READY(right) == -1)
10779 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010780
10781 /* a string is equal to itself */
10782 if (left == right)
10783 return 0;
10784
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010785 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010787 PyErr_Format(PyExc_TypeError,
10788 "Can't compare %.100s and %.100s",
10789 left->ob_type->tp_name,
10790 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010791 return -1;
10792}
10793
Martin v. Löwis5b222132007-06-10 09:51:05 +000010794int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010795_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10796{
10797 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10798 if (right_str == NULL)
10799 return -1;
10800 return PyUnicode_Compare(left, right_str);
10801}
10802
10803int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010804PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 Py_ssize_t i;
10807 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 Py_UCS4 chr;
10809
Victor Stinner910337b2011-10-03 03:20:16 +020010810 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 if (PyUnicode_READY(uni) == -1)
10812 return -1;
10813 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010814 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010815 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010816 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010817 size_t len, len2 = strlen(str);
10818 int cmp;
10819
10820 len = Py_MIN(len1, len2);
10821 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010822 if (cmp != 0) {
10823 if (cmp < 0)
10824 return -1;
10825 else
10826 return 1;
10827 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010828 if (len1 > len2)
10829 return 1; /* uni is longer */
10830 if (len2 > len1)
10831 return -1; /* str is longer */
10832 return 0;
10833 }
10834 else {
10835 void *data = PyUnicode_DATA(uni);
10836 /* Compare Unicode string and source character set string */
10837 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010838 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010839 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10840 /* This check keeps Python strings that end in '\0' from comparing equal
10841 to C strings identical up to that point. */
10842 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10843 return 1; /* uni is longer */
10844 if (str[i])
10845 return -1; /* str is longer */
10846 return 0;
10847 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010848}
10849
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010850
Benjamin Peterson29060642009-01-31 22:14:21 +000010851#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010852 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010853
Alexander Belopolsky40018472011-02-26 01:02:56 +000010854PyObject *
10855PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010856{
10857 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010858 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010859
Victor Stinnere5567ad2012-10-23 02:48:49 +020010860 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10861 Py_RETURN_NOTIMPLEMENTED;
10862
10863 if (PyUnicode_READY(left) == -1 ||
10864 PyUnicode_READY(right) == -1)
10865 return NULL;
10866
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010867 if (left == right) {
10868 switch (op) {
10869 case Py_EQ:
10870 case Py_LE:
10871 case Py_GE:
10872 /* a string is equal to itself */
10873 v = Py_True;
10874 break;
10875 case Py_NE:
10876 case Py_LT:
10877 case Py_GT:
10878 v = Py_False;
10879 break;
10880 default:
10881 PyErr_BadArgument();
10882 return NULL;
10883 }
10884 }
10885 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010886 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010887 result ^= (op == Py_NE);
10888 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010889 }
10890 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010891 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010892
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010893 /* Convert the return value to a Boolean */
10894 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010895 case Py_LE:
10896 v = TEST_COND(result <= 0);
10897 break;
10898 case Py_GE:
10899 v = TEST_COND(result >= 0);
10900 break;
10901 case Py_LT:
10902 v = TEST_COND(result == -1);
10903 break;
10904 case Py_GT:
10905 v = TEST_COND(result == 1);
10906 break;
10907 default:
10908 PyErr_BadArgument();
10909 return NULL;
10910 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010911 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010912 Py_INCREF(v);
10913 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010914}
10915
Alexander Belopolsky40018472011-02-26 01:02:56 +000010916int
10917PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010918{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010919 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010920 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 void *buf1, *buf2;
10922 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010923 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010924
10925 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010926 sub = PyUnicode_FromObject(element);
10927 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010928 PyErr_Format(PyExc_TypeError,
10929 "'in <string>' requires string as left operand, not %s",
10930 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010931 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010932 }
10933
Thomas Wouters477c8d52006-05-27 19:21:47 +000010934 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010935 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010936 Py_DECREF(sub);
10937 return -1;
10938 }
10939
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010940 kind1 = PyUnicode_KIND(str);
10941 kind2 = PyUnicode_KIND(sub);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942 buf1 = PyUnicode_DATA(str);
10943 buf2 = PyUnicode_DATA(sub);
Victor Stinner77282cb2013-04-14 19:22:47 +020010944 if (kind2 != kind1) {
10945 if (kind2 > kind1) {
Antoine Pitrou758153b2012-05-12 15:51:51 +020010946 Py_DECREF(sub);
10947 Py_DECREF(str);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010948 return 0;
Antoine Pitrou758153b2012-05-12 15:51:51 +020010949 }
Victor Stinner77282cb2013-04-14 19:22:47 +020010950 buf2 = _PyUnicode_AsKind(sub, kind1);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 if (!buf2) {
10953 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010954 Py_DECREF(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010955 return -1;
10956 }
10957 len1 = PyUnicode_GET_LENGTH(str);
10958 len2 = PyUnicode_GET_LENGTH(sub);
10959
Victor Stinner77282cb2013-04-14 19:22:47 +020010960 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961 case PyUnicode_1BYTE_KIND:
10962 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10963 break;
10964 case PyUnicode_2BYTE_KIND:
10965 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10966 break;
10967 case PyUnicode_4BYTE_KIND:
10968 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10969 break;
10970 default:
10971 result = -1;
10972 assert(0);
10973 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010974
10975 Py_DECREF(str);
10976 Py_DECREF(sub);
10977
Victor Stinner77282cb2013-04-14 19:22:47 +020010978 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 PyMem_Free(buf2);
10980
Guido van Rossum403d68b2000-03-13 15:55:09 +000010981 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010982}
10983
Guido van Rossumd57fd912000-03-10 22:53:23 +000010984/* Concat to string or Unicode object giving a new Unicode object. */
10985
Alexander Belopolsky40018472011-02-26 01:02:56 +000010986PyObject *
10987PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010988{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010990 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010991 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992
10993 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010994 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010996 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010998 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010999 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000
11001 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020011002 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011003 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005 }
Victor Stinnera464fc12011-10-02 20:39:30 +020011006 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011007 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011009 }
11010
Victor Stinner488fa492011-12-12 00:01:39 +010011011 u_len = PyUnicode_GET_LENGTH(u);
11012 v_len = PyUnicode_GET_LENGTH(v);
11013 if (u_len > PY_SSIZE_T_MAX - v_len) {
11014 PyErr_SetString(PyExc_OverflowError,
11015 "strings are too large to concat");
11016 goto onError;
11017 }
11018 new_len = u_len + v_len;
11019
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011021 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011022 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023
Guido van Rossumd57fd912000-03-10 22:53:23 +000011024 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011025 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011026 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011027 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011028 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11029 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011030 Py_DECREF(u);
11031 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011032 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011034
Benjamin Peterson29060642009-01-31 22:14:21 +000011035 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011036 Py_XDECREF(u);
11037 Py_XDECREF(v);
11038 return NULL;
11039}
11040
Walter Dörwald1ab83302007-05-18 17:15:44 +000011041void
Victor Stinner23e56682011-10-03 03:54:37 +020011042PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011043{
Victor Stinner23e56682011-10-03 03:54:37 +020011044 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011045 Py_UCS4 maxchar, maxchar2;
11046 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011047
11048 if (p_left == NULL) {
11049 if (!PyErr_Occurred())
11050 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011051 return;
11052 }
Victor Stinner23e56682011-10-03 03:54:37 +020011053 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011054 if (right == NULL || left == NULL
11055 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011056 if (!PyErr_Occurred())
11057 PyErr_BadInternalCall();
11058 goto error;
11059 }
11060
Benjamin Petersonbac79492012-01-14 13:34:47 -050011061 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011062 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011063 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011064 goto error;
11065
Victor Stinner488fa492011-12-12 00:01:39 +010011066 /* Shortcuts */
11067 if (left == unicode_empty) {
11068 Py_DECREF(left);
11069 Py_INCREF(right);
11070 *p_left = right;
11071 return;
11072 }
11073 if (right == unicode_empty)
11074 return;
11075
11076 left_len = PyUnicode_GET_LENGTH(left);
11077 right_len = PyUnicode_GET_LENGTH(right);
11078 if (left_len > PY_SSIZE_T_MAX - right_len) {
11079 PyErr_SetString(PyExc_OverflowError,
11080 "strings are too large to concat");
11081 goto error;
11082 }
11083 new_len = left_len + right_len;
11084
11085 if (unicode_modifiable(left)
11086 && PyUnicode_CheckExact(right)
11087 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011088 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11089 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011090 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011091 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011092 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11093 {
11094 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011095 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011096 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011097
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011098 /* copy 'right' into the newly allocated area of 'left' */
11099 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011100 }
Victor Stinner488fa492011-12-12 00:01:39 +010011101 else {
11102 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11103 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011104 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011105
Victor Stinner488fa492011-12-12 00:01:39 +010011106 /* Concat the two Unicode strings */
11107 res = PyUnicode_New(new_len, maxchar);
11108 if (res == NULL)
11109 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011110 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11111 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011112 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011113 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011114 }
11115 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011116 return;
11117
11118error:
Victor Stinner488fa492011-12-12 00:01:39 +010011119 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011120}
11121
11122void
11123PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11124{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011125 PyUnicode_Append(pleft, right);
11126 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011127}
11128
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011129PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011130 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011132Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011133string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011134interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135
11136static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011137unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011138{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011139 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011140 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011141 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011143 int kind1, kind2, kind;
11144 void *buf1, *buf2;
11145 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146
Jesus Ceaac451502011-04-20 17:09:23 +020011147 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11148 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011149 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011150
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011151 kind1 = PyUnicode_KIND(self);
11152 kind2 = PyUnicode_KIND(substring);
Christian Heimesd47802e2013-06-29 21:33:36 +020011153 if (kind2 > kind1) {
11154 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011155 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011156 }
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011157 kind = kind1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 buf1 = PyUnicode_DATA(self);
11159 buf2 = PyUnicode_DATA(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 if (kind2 != kind)
Victor Stinner7931d9a2011-11-04 00:22:48 +010011161 buf2 = _PyUnicode_AsKind(substring, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 if (!buf2) {
11163 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011164 return NULL;
11165 }
11166 len1 = PyUnicode_GET_LENGTH(self);
11167 len2 = PyUnicode_GET_LENGTH(substring);
11168
11169 ADJUST_INDICES(start, end, len1);
Benjamin Petersonead6b532011-12-20 17:23:42 -060011170 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011171 case PyUnicode_1BYTE_KIND:
11172 iresult = ucs1lib_count(
11173 ((Py_UCS1*)buf1) + start, end - start,
11174 buf2, len2, PY_SSIZE_T_MAX
11175 );
11176 break;
11177 case PyUnicode_2BYTE_KIND:
11178 iresult = ucs2lib_count(
11179 ((Py_UCS2*)buf1) + start, end - start,
11180 buf2, len2, PY_SSIZE_T_MAX
11181 );
11182 break;
11183 case PyUnicode_4BYTE_KIND:
11184 iresult = ucs4lib_count(
11185 ((Py_UCS4*)buf1) + start, end - start,
11186 buf2, len2, PY_SSIZE_T_MAX
11187 );
11188 break;
11189 default:
11190 assert(0); iresult = 0;
11191 }
11192
11193 result = PyLong_FromSsize_t(iresult);
11194
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011195 if (kind2 != kind)
11196 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197
11198 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011199
Guido van Rossumd57fd912000-03-10 22:53:23 +000011200 return result;
11201}
11202
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011203PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011204 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011206Encode S using the codec registered for encoding. Default encoding\n\
11207is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011208handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011209a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11210'xmlcharrefreplace' as well as any other name registered with\n\
11211codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212
11213static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011214unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011216 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011217 char *encoding = NULL;
11218 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011219
Benjamin Peterson308d6372009-09-18 21:42:35 +000011220 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11221 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011223 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011224}
11225
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011226PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011227 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228\n\
11229Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011230If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231
11232static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011233unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011235 Py_ssize_t i, j, line_pos, src_len, incr;
11236 Py_UCS4 ch;
11237 PyObject *u;
11238 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011239 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011240 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011241 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011242 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011243
Ezio Melotti745d54d2013-11-16 19:10:57 +020011244 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11245 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011247
Antoine Pitrou22425222011-10-04 19:10:51 +020011248 if (PyUnicode_READY(self) == -1)
11249 return NULL;
11250
Thomas Wouters7e474022000-07-16 12:04:32 +000011251 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011252 src_len = PyUnicode_GET_LENGTH(self);
11253 i = j = line_pos = 0;
11254 kind = PyUnicode_KIND(self);
11255 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011256 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011257 for (; i < src_len; i++) {
11258 ch = PyUnicode_READ(kind, src_data, i);
11259 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011260 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011261 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011262 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011263 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011264 goto overflow;
11265 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011266 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011267 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011270 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011271 goto overflow;
11272 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011274 if (ch == '\n' || ch == '\r')
11275 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011277 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011278 if (!found)
11279 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011280
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011282 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283 if (!u)
11284 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011285 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286
Antoine Pitroue71d5742011-10-04 15:55:09 +020011287 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011288
Antoine Pitroue71d5742011-10-04 15:55:09 +020011289 for (; i < src_len; i++) {
11290 ch = PyUnicode_READ(kind, src_data, i);
11291 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011292 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011293 incr = tabsize - (line_pos % tabsize);
11294 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011295 FILL(kind, dest_data, ' ', j, incr);
11296 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011297 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011298 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011300 line_pos++;
11301 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011302 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011303 if (ch == '\n' || ch == '\r')
11304 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011306 }
11307 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011308 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011309
Antoine Pitroue71d5742011-10-04 15:55:09 +020011310 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011311 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11312 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313}
11314
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011315PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011316 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317\n\
11318Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011319such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320arguments start and end are interpreted as in slice notation.\n\
11321\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011322Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323
11324static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011327 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011328 Py_ssize_t start;
11329 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011330 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011331
Jesus Ceaac451502011-04-20 17:09:23 +020011332 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11333 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011334 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011335
Christian Heimesd47802e2013-06-29 21:33:36 +020011336 if (PyUnicode_READY(self) == -1) {
11337 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011338 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011339 }
11340 if (PyUnicode_READY(substring) == -1) {
11341 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011343 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344
Victor Stinner7931d9a2011-11-04 00:22:48 +010011345 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011346
11347 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011348
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 if (result == -2)
11350 return NULL;
11351
Christian Heimes217cfd12007-12-02 14:31:20 +000011352 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011353}
11354
11355static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011356unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011357{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011358 void *data;
11359 enum PyUnicode_Kind kind;
11360 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011361
11362 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11363 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011364 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011365 }
11366 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11367 PyErr_SetString(PyExc_IndexError, "string index out of range");
11368 return NULL;
11369 }
11370 kind = PyUnicode_KIND(self);
11371 data = PyUnicode_DATA(self);
11372 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011373 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374}
11375
Guido van Rossumc2504932007-09-18 19:42:40 +000011376/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011377 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011378static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011379unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380{
Guido van Rossumc2504932007-09-18 19:42:40 +000011381 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011382 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011383
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011384#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011385 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011386#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011387 if (_PyUnicode_HASH(self) != -1)
11388 return _PyUnicode_HASH(self);
11389 if (PyUnicode_READY(self) == -1)
11390 return -1;
11391 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011392 /*
11393 We make the hash of the empty string be 0, rather than using
11394 (prefix ^ suffix), since this slightly obfuscates the hash secret
11395 */
11396 if (len == 0) {
11397 _PyUnicode_HASH(self) = 0;
11398 return 0;
11399 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011400 x = _Py_HashBytes(PyUnicode_DATA(self),
11401 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011402 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011403 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404}
11405
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011406PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011407 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011409Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410
11411static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011412unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011414 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011415 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000011416 Py_ssize_t start;
11417 Py_ssize_t end;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418
Jesus Ceaac451502011-04-20 17:09:23 +020011419 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11420 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011421 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011422
Christian Heimesd47a0452013-06-29 21:21:37 +020011423 if (PyUnicode_READY(self) == -1) {
11424 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011425 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011426 }
11427 if (PyUnicode_READY(substring) == -1) {
11428 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011429 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011430 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011431
Victor Stinner7931d9a2011-11-04 00:22:48 +010011432 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433
11434 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011435
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011436 if (result == -2)
11437 return NULL;
11438
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 if (result < 0) {
11440 PyErr_SetString(PyExc_ValueError, "substring not found");
11441 return NULL;
11442 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011443
Christian Heimes217cfd12007-12-02 14:31:20 +000011444 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445}
11446
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011447PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011448 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011450Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011451at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011452
11453static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011454unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011456 Py_ssize_t i, length;
11457 int kind;
11458 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459 int cased;
11460
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011461 if (PyUnicode_READY(self) == -1)
11462 return NULL;
11463 length = PyUnicode_GET_LENGTH(self);
11464 kind = PyUnicode_KIND(self);
11465 data = PyUnicode_DATA(self);
11466
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011468 if (length == 1)
11469 return PyBool_FromLong(
11470 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011471
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011472 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011474 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011475
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011477 for (i = 0; i < length; i++) {
11478 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011479
Benjamin Peterson29060642009-01-31 22:14:21 +000011480 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11481 return PyBool_FromLong(0);
11482 else if (!cased && Py_UNICODE_ISLOWER(ch))
11483 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011485 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486}
11487
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011488PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011489 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011491Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011492at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493
11494static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011495unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 Py_ssize_t i, length;
11498 int kind;
11499 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 int cased;
11501
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011502 if (PyUnicode_READY(self) == -1)
11503 return NULL;
11504 length = PyUnicode_GET_LENGTH(self);
11505 kind = PyUnicode_KIND(self);
11506 data = PyUnicode_DATA(self);
11507
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011509 if (length == 1)
11510 return PyBool_FromLong(
11511 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011513 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011514 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011515 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011516
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011518 for (i = 0; i < length; i++) {
11519 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011520
Benjamin Peterson29060642009-01-31 22:14:21 +000011521 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11522 return PyBool_FromLong(0);
11523 else if (!cased && Py_UNICODE_ISUPPER(ch))
11524 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011525 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011526 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527}
11528
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011529PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011530 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011532Return True if S is a titlecased string and there is at least one\n\
11533character in S, i.e. upper- and titlecase characters may only\n\
11534follow uncased characters and lowercase characters only cased ones.\n\
11535Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536
11537static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011538unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011540 Py_ssize_t i, length;
11541 int kind;
11542 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543 int cased, previous_is_cased;
11544
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011545 if (PyUnicode_READY(self) == -1)
11546 return NULL;
11547 length = PyUnicode_GET_LENGTH(self);
11548 kind = PyUnicode_KIND(self);
11549 data = PyUnicode_DATA(self);
11550
Guido van Rossumd57fd912000-03-10 22:53:23 +000011551 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011552 if (length == 1) {
11553 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11554 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11555 (Py_UNICODE_ISUPPER(ch) != 0));
11556 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011557
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011558 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011559 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011560 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011561
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562 cased = 0;
11563 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011564 for (i = 0; i < length; i++) {
11565 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011566
Benjamin Peterson29060642009-01-31 22:14:21 +000011567 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11568 if (previous_is_cased)
11569 return PyBool_FromLong(0);
11570 previous_is_cased = 1;
11571 cased = 1;
11572 }
11573 else if (Py_UNICODE_ISLOWER(ch)) {
11574 if (!previous_is_cased)
11575 return PyBool_FromLong(0);
11576 previous_is_cased = 1;
11577 cased = 1;
11578 }
11579 else
11580 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011581 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011582 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583}
11584
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011585PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011587\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011588Return True if all characters in S are whitespace\n\
11589and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011590
11591static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011592unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011593{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011594 Py_ssize_t i, length;
11595 int kind;
11596 void *data;
11597
11598 if (PyUnicode_READY(self) == -1)
11599 return NULL;
11600 length = PyUnicode_GET_LENGTH(self);
11601 kind = PyUnicode_KIND(self);
11602 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 if (length == 1)
11606 return PyBool_FromLong(
11607 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011609 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011610 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011611 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011612
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011613 for (i = 0; i < length; i++) {
11614 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011615 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011616 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011618 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619}
11620
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011621PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011622 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011623\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011624Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011625and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011626
11627static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011628unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011629{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630 Py_ssize_t i, length;
11631 int kind;
11632 void *data;
11633
11634 if (PyUnicode_READY(self) == -1)
11635 return NULL;
11636 length = PyUnicode_GET_LENGTH(self);
11637 kind = PyUnicode_KIND(self);
11638 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011639
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011640 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 if (length == 1)
11642 return PyBool_FromLong(
11643 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011644
11645 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011646 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011647 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011648
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011649 for (i = 0; i < length; i++) {
11650 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011651 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011652 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011653 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011654}
11655
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011656PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011657 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011658\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011659Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011660and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011661
11662static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011663unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011664{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011665 int kind;
11666 void *data;
11667 Py_ssize_t len, i;
11668
11669 if (PyUnicode_READY(self) == -1)
11670 return NULL;
11671
11672 kind = PyUnicode_KIND(self);
11673 data = PyUnicode_DATA(self);
11674 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011675
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011676 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 if (len == 1) {
11678 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11679 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11680 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011681
11682 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011683 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011684 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011686 for (i = 0; i < len; i++) {
11687 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011688 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011689 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011690 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011691 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011692}
11693
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011694PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011695 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011697Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011698False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699
11700static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011701unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 Py_ssize_t i, length;
11704 int kind;
11705 void *data;
11706
11707 if (PyUnicode_READY(self) == -1)
11708 return NULL;
11709 length = PyUnicode_GET_LENGTH(self);
11710 kind = PyUnicode_KIND(self);
11711 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 if (length == 1)
11715 return PyBool_FromLong(
11716 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011718 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011719 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011720 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011721
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011722 for (i = 0; i < length; i++) {
11723 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011724 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011726 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727}
11728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011729PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011730 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011731\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011732Return True if all characters in S are digits\n\
11733and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011734
11735static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011736unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011738 Py_ssize_t i, length;
11739 int kind;
11740 void *data;
11741
11742 if (PyUnicode_READY(self) == -1)
11743 return NULL;
11744 length = PyUnicode_GET_LENGTH(self);
11745 kind = PyUnicode_KIND(self);
11746 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747
Guido van Rossumd57fd912000-03-10 22:53:23 +000011748 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 if (length == 1) {
11750 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11751 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011754 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011756 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011757
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 for (i = 0; i < length; i++) {
11759 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011760 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011762 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763}
11764
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011765PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011766 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011768Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011769False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770
11771static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011772unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774 Py_ssize_t i, length;
11775 int kind;
11776 void *data;
11777
11778 if (PyUnicode_READY(self) == -1)
11779 return NULL;
11780 length = PyUnicode_GET_LENGTH(self);
11781 kind = PyUnicode_KIND(self);
11782 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 if (length == 1)
11786 return PyBool_FromLong(
11787 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011789 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011791 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011792
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 for (i = 0; i < length; i++) {
11794 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011795 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011797 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798}
11799
Martin v. Löwis47383402007-08-15 07:32:56 +000011800int
11801PyUnicode_IsIdentifier(PyObject *self)
11802{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 int kind;
11804 void *data;
11805 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011806 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 if (PyUnicode_READY(self) == -1) {
11809 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011811 }
11812
11813 /* Special case for empty strings */
11814 if (PyUnicode_GET_LENGTH(self) == 0)
11815 return 0;
11816 kind = PyUnicode_KIND(self);
11817 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011818
11819 /* PEP 3131 says that the first character must be in
11820 XID_Start and subsequent characters in XID_Continue,
11821 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011822 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011823 letters, digits, underscore). However, given the current
11824 definition of XID_Start and XID_Continue, it is sufficient
11825 to check just for these, except that _ must be allowed
11826 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011828 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011829 return 0;
11830
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011831 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011832 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011833 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011834 return 1;
11835}
11836
11837PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011838 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011839\n\
11840Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011841to the language definition.\n\
11842\n\
11843Use keyword.iskeyword() to test for reserved identifiers\n\
11844such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011845
11846static PyObject*
11847unicode_isidentifier(PyObject *self)
11848{
11849 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11850}
11851
Georg Brandl559e5d72008-06-11 18:37:52 +000011852PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011853 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011854\n\
11855Return True if all characters in S are considered\n\
11856printable in repr() or S is empty, False otherwise.");
11857
11858static PyObject*
11859unicode_isprintable(PyObject *self)
11860{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 Py_ssize_t i, length;
11862 int kind;
11863 void *data;
11864
11865 if (PyUnicode_READY(self) == -1)
11866 return NULL;
11867 length = PyUnicode_GET_LENGTH(self);
11868 kind = PyUnicode_KIND(self);
11869 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011870
11871 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 if (length == 1)
11873 return PyBool_FromLong(
11874 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011875
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 for (i = 0; i < length; i++) {
11877 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011878 Py_RETURN_FALSE;
11879 }
11880 }
11881 Py_RETURN_TRUE;
11882}
11883
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011884PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011885 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886\n\
11887Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011888iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889
11890static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011891unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011893 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894}
11895
Martin v. Löwis18e16552006-02-15 17:27:45 +000011896static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011897unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 if (PyUnicode_READY(self) == -1)
11900 return -1;
11901 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902}
11903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011904PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011905 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011907Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011908done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909
11910static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011911unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011913 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 Py_UCS4 fillchar = ' ';
11915
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011916 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917 return NULL;
11918
Benjamin Petersonbac79492012-01-14 13:34:47 -050011919 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011920 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921
Victor Stinnerc4b49542011-12-11 22:44:26 +010011922 if (PyUnicode_GET_LENGTH(self) >= width)
11923 return unicode_result_unchanged(self);
11924
11925 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926}
11927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011928PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011931Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932
11933static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011934unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011935{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011936 if (PyUnicode_READY(self) == -1)
11937 return NULL;
11938 if (PyUnicode_IS_ASCII(self))
11939 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011940 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941}
11942
/* Selector values for the strip helpers; they double as indices into
   stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above.
   The table is never modified, so both the strings and the pointers to
   them are const. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for a selector: skips the 3-character "|O:" prefix of
   the matching format string. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11951
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011952/* externally visible for str.strip(unicode) */
11953PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011954_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011955{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 void *data;
11957 int kind;
11958 Py_ssize_t i, j, len;
11959 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011960 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11963 return NULL;
11964
11965 kind = PyUnicode_KIND(self);
11966 data = PyUnicode_DATA(self);
11967 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011968 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011969 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11970 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011971 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011972
Benjamin Peterson14339b62009-01-31 16:36:08 +000011973 i = 0;
11974 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011975 while (i < len) {
11976 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11977 if (!BLOOM(sepmask, ch))
11978 break;
11979 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11980 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011981 i++;
11982 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011983 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011984
Benjamin Peterson14339b62009-01-31 16:36:08 +000011985 j = len;
11986 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011987 j--;
11988 while (j >= i) {
11989 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11990 if (!BLOOM(sepmask, ch))
11991 break;
11992 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11993 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011994 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011995 }
11996
Benjamin Peterson29060642009-01-31 22:14:21 +000011997 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011998 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011999
Victor Stinner7931d9a2011-11-04 00:22:48 +010012000 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001}
12002
12003PyObject*
12004PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12005{
12006 unsigned char *data;
12007 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012008 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009
Victor Stinnerde636f32011-10-01 03:55:54 +020012010 if (PyUnicode_READY(self) == -1)
12011 return NULL;
12012
Victor Stinner684d5fd2012-05-03 02:32:34 +020012013 length = PyUnicode_GET_LENGTH(self);
12014 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012015
Victor Stinner684d5fd2012-05-03 02:32:34 +020012016 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012017 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012018
Victor Stinnerde636f32011-10-01 03:55:54 +020012019 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012020 PyErr_SetString(PyExc_IndexError, "string index out of range");
12021 return NULL;
12022 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012023 if (start >= length || end < start)
12024 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012025
Victor Stinner684d5fd2012-05-03 02:32:34 +020012026 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012027 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012028 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012029 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012030 }
12031 else {
12032 kind = PyUnicode_KIND(self);
12033 data = PyUnicode_1BYTE_DATA(self);
12034 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012035 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012036 length);
12037 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039
12040static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012041do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 Py_ssize_t len, i, j;
12044
12045 if (PyUnicode_READY(self) == -1)
12046 return NULL;
12047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012049
Victor Stinnercc7af722013-04-09 22:39:24 +020012050 if (PyUnicode_IS_ASCII(self)) {
12051 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12052
12053 i = 0;
12054 if (striptype != RIGHTSTRIP) {
12055 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012056 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012057 if (!_Py_ascii_whitespace[ch])
12058 break;
12059 i++;
12060 }
12061 }
12062
12063 j = len;
12064 if (striptype != LEFTSTRIP) {
12065 j--;
12066 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012067 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012068 if (!_Py_ascii_whitespace[ch])
12069 break;
12070 j--;
12071 }
12072 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012073 }
12074 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012075 else {
12076 int kind = PyUnicode_KIND(self);
12077 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012078
Victor Stinnercc7af722013-04-09 22:39:24 +020012079 i = 0;
12080 if (striptype != RIGHTSTRIP) {
12081 while (i < len) {
12082 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12083 if (!Py_UNICODE_ISSPACE(ch))
12084 break;
12085 i++;
12086 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012087 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012088
12089 j = len;
12090 if (striptype != LEFTSTRIP) {
12091 j--;
12092 while (j >= i) {
12093 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12094 if (!Py_UNICODE_ISSPACE(ch))
12095 break;
12096 j--;
12097 }
12098 j++;
12099 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012100 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012101
Victor Stinner7931d9a2011-11-04 00:22:48 +010012102 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012103}
12104
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012105
12106static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012107do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012108{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012109 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012110
Serhiy Storchakac6792272013-10-19 21:03:34 +030012111 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012112 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012113
Benjamin Peterson14339b62009-01-31 16:36:08 +000012114 if (sep != NULL && sep != Py_None) {
12115 if (PyUnicode_Check(sep))
12116 return _PyUnicode_XStrip(self, striptype, sep);
12117 else {
12118 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012119 "%s arg must be None or str",
12120 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012121 return NULL;
12122 }
12123 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012124
Benjamin Peterson14339b62009-01-31 16:36:08 +000012125 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012126}
12127
12128
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012129PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012130 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012131\n\
12132Return a copy of the string S with leading and trailing\n\
12133whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012134If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012135
12136static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012137unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012138{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012139 if (PyTuple_GET_SIZE(args) == 0)
12140 return do_strip(self, BOTHSTRIP); /* Common case */
12141 else
12142 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012143}
12144
12145
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012146PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012147 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012148\n\
12149Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012150If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012151
12152static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012153unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012154{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012155 if (PyTuple_GET_SIZE(args) == 0)
12156 return do_strip(self, LEFTSTRIP); /* Common case */
12157 else
12158 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012159}
12160
12161
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012162PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012163 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012164\n\
12165Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012166If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012167
12168static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012169unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012170{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012171 if (PyTuple_GET_SIZE(args) == 0)
12172 return do_strip(self, RIGHTSTRIP); /* Common case */
12173 else
12174 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012175}
12176
12177
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012179unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012181 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012182 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183
Serhiy Storchaka05997252013-01-26 12:14:02 +020012184 if (len < 1)
12185 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012186
Victor Stinnerc4b49542011-12-11 22:44:26 +010012187 /* no repeat, return original string */
12188 if (len == 1)
12189 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012190
Benjamin Petersonbac79492012-01-14 13:34:47 -050012191 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192 return NULL;
12193
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012194 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012195 PyErr_SetString(PyExc_OverflowError,
12196 "repeated string is too long");
12197 return NULL;
12198 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012200
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012201 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202 if (!u)
12203 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012204 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012206 if (PyUnicode_GET_LENGTH(str) == 1) {
12207 const int kind = PyUnicode_KIND(str);
12208 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012209 if (kind == PyUnicode_1BYTE_KIND) {
12210 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012211 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012212 }
12213 else if (kind == PyUnicode_2BYTE_KIND) {
12214 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012215 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012216 ucs2[n] = fill_char;
12217 } else {
12218 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12219 assert(kind == PyUnicode_4BYTE_KIND);
12220 for (n = 0; n < len; ++n)
12221 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012222 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 }
12224 else {
12225 /* number of characters copied this far */
12226 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012227 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 char *to = (char *) PyUnicode_DATA(u);
12229 Py_MEMCPY(to, PyUnicode_DATA(str),
12230 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012231 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012232 n = (done <= nchars-done) ? done : nchars-done;
12233 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012234 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236 }
12237
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012238 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012239 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240}
12241
Alexander Belopolsky40018472011-02-26 01:02:56 +000012242PyObject *
12243PyUnicode_Replace(PyObject *obj,
12244 PyObject *subobj,
12245 PyObject *replobj,
12246 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247{
12248 PyObject *self;
12249 PyObject *str1;
12250 PyObject *str2;
12251 PyObject *result;
12252
12253 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012254 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012257 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012258 Py_DECREF(self);
12259 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260 }
12261 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012262 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012263 Py_DECREF(self);
12264 Py_DECREF(str1);
12265 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012267 if (PyUnicode_READY(self) == -1 ||
12268 PyUnicode_READY(str1) == -1 ||
12269 PyUnicode_READY(str2) == -1)
12270 result = NULL;
12271 else
12272 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273 Py_DECREF(self);
12274 Py_DECREF(str1);
12275 Py_DECREF(str2);
12276 return result;
12277}
12278
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012279PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012280 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281\n\
12282Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012283old replaced by new. If the optional argument count is\n\
12284given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285
12286static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012287unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012289 PyObject *str1;
12290 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012291 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292 PyObject *result;
12293
Martin v. Löwis18e16552006-02-15 17:27:45 +000012294 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012295 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012296 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012297 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012299 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012300 return NULL;
12301 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012302 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012303 Py_DECREF(str1);
12304 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012305 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012306 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12307 result = NULL;
12308 else
12309 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012310
12311 Py_DECREF(str1);
12312 Py_DECREF(str2);
12313 return result;
12314}
12315
Alexander Belopolsky40018472011-02-26 01:02:56 +000012316static PyObject *
12317unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012318{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012319 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 Py_ssize_t isize;
12321 Py_ssize_t osize, squote, dquote, i, o;
12322 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012323 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012327 return NULL;
12328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012329 isize = PyUnicode_GET_LENGTH(unicode);
12330 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012331
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 /* Compute length of output, quote characters, and
12333 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012334 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 max = 127;
12336 squote = dquote = 0;
12337 ikind = PyUnicode_KIND(unicode);
12338 for (i = 0; i < isize; i++) {
12339 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012340 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012342 case '\'': squote++; break;
12343 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012345 incr = 2;
12346 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347 default:
12348 /* Fast-path ASCII */
12349 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012350 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012352 ;
12353 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012354 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012356 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012358 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012360 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012362 if (osize > PY_SSIZE_T_MAX - incr) {
12363 PyErr_SetString(PyExc_OverflowError,
12364 "string is too long to generate repr");
12365 return NULL;
12366 }
12367 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 }
12369
12370 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012371 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012373 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374 if (dquote)
12375 /* Both squote and dquote present. Use squote,
12376 and escape them */
12377 osize += squote;
12378 else
12379 quote = '"';
12380 }
Victor Stinner55c08782013-04-14 18:45:39 +020012381 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382
12383 repr = PyUnicode_New(osize, max);
12384 if (repr == NULL)
12385 return NULL;
12386 okind = PyUnicode_KIND(repr);
12387 odata = PyUnicode_DATA(repr);
12388
12389 PyUnicode_WRITE(okind, odata, 0, quote);
12390 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012391 if (unchanged) {
12392 _PyUnicode_FastCopyCharacters(repr, 1,
12393 unicode, 0,
12394 isize);
12395 }
12396 else {
12397 for (i = 0, o = 1; i < isize; i++) {
12398 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399
Victor Stinner55c08782013-04-14 18:45:39 +020012400 /* Escape quotes and backslashes */
12401 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012402 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012403 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012404 continue;
12405 }
12406
12407 /* Map special whitespace to '\t', \n', '\r' */
12408 if (ch == '\t') {
12409 PyUnicode_WRITE(okind, odata, o++, '\\');
12410 PyUnicode_WRITE(okind, odata, o++, 't');
12411 }
12412 else if (ch == '\n') {
12413 PyUnicode_WRITE(okind, odata, o++, '\\');
12414 PyUnicode_WRITE(okind, odata, o++, 'n');
12415 }
12416 else if (ch == '\r') {
12417 PyUnicode_WRITE(okind, odata, o++, '\\');
12418 PyUnicode_WRITE(okind, odata, o++, 'r');
12419 }
12420
12421 /* Map non-printable US ASCII to '\xhh' */
12422 else if (ch < ' ' || ch == 0x7F) {
12423 PyUnicode_WRITE(okind, odata, o++, '\\');
12424 PyUnicode_WRITE(okind, odata, o++, 'x');
12425 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12426 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12427 }
12428
12429 /* Copy ASCII characters as-is */
12430 else if (ch < 0x7F) {
12431 PyUnicode_WRITE(okind, odata, o++, ch);
12432 }
12433
12434 /* Non-ASCII characters */
12435 else {
12436 /* Map Unicode whitespace and control characters
12437 (categories Z* and C* except ASCII space)
12438 */
12439 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12440 PyUnicode_WRITE(okind, odata, o++, '\\');
12441 /* Map 8-bit characters to '\xhh' */
12442 if (ch <= 0xff) {
12443 PyUnicode_WRITE(okind, odata, o++, 'x');
12444 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12445 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12446 }
12447 /* Map 16-bit characters to '\uxxxx' */
12448 else if (ch <= 0xffff) {
12449 PyUnicode_WRITE(okind, odata, o++, 'u');
12450 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12451 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12452 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12453 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12454 }
12455 /* Map 21-bit characters to '\U00xxxxxx' */
12456 else {
12457 PyUnicode_WRITE(okind, odata, o++, 'U');
12458 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12459 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12460 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12461 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12462 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12463 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12464 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12465 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12466 }
12467 }
12468 /* Copy characters as-is */
12469 else {
12470 PyUnicode_WRITE(okind, odata, o++, ch);
12471 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012472 }
12473 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012474 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012475 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012476 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012477 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478}
12479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012480PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012481 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012482\n\
12483Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012484such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485arguments start and end are interpreted as in slice notation.\n\
12486\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012487Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012488
12489static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012491{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012492 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012493 Py_ssize_t start;
12494 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012495 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012496
Jesus Ceaac451502011-04-20 17:09:23 +020012497 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12498 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012499 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500
Christian Heimesea71a522013-06-29 21:17:34 +020012501 if (PyUnicode_READY(self) == -1) {
12502 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012503 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012504 }
12505 if (PyUnicode_READY(substring) == -1) {
12506 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012508 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012509
Victor Stinner7931d9a2011-11-04 00:22:48 +010012510 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511
12512 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012513
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 if (result == -2)
12515 return NULL;
12516
Christian Heimes217cfd12007-12-02 14:31:20 +000012517 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518}
12519
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012520PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012521 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012523Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524
12525static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012528 PyObject *substring;
Christian Heimes9cd17752007-11-18 19:35:23 +000012529 Py_ssize_t start;
12530 Py_ssize_t end;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012531 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532
Jesus Ceaac451502011-04-20 17:09:23 +020012533 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12534 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012535 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012536
Christian Heimesea71a522013-06-29 21:17:34 +020012537 if (PyUnicode_READY(self) == -1) {
12538 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012540 }
12541 if (PyUnicode_READY(substring) == -1) {
12542 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012545
Victor Stinner7931d9a2011-11-04 00:22:48 +010012546 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547
12548 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 if (result == -2)
12551 return NULL;
12552
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553 if (result < 0) {
12554 PyErr_SetString(PyExc_ValueError, "substring not found");
12555 return NULL;
12556 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012557
Christian Heimes217cfd12007-12-02 14:31:20 +000012558 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012559}
12560
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012561PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012562 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012564Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012565done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566
12567static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012568unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012569{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012570 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012571 Py_UCS4 fillchar = ' ';
12572
Victor Stinnere9a29352011-10-01 02:14:59 +020012573 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012575
Benjamin Petersonbac79492012-01-14 13:34:47 -050012576 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577 return NULL;
12578
Victor Stinnerc4b49542011-12-11 22:44:26 +010012579 if (PyUnicode_GET_LENGTH(self) >= width)
12580 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581
Victor Stinnerc4b49542011-12-11 22:44:26 +010012582 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012583}
12584
Alexander Belopolsky40018472011-02-26 01:02:56 +000012585PyObject *
12586PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012587{
12588 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012589
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590 s = PyUnicode_FromObject(s);
12591 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012592 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012593 if (sep != NULL) {
12594 sep = PyUnicode_FromObject(sep);
12595 if (sep == NULL) {
12596 Py_DECREF(s);
12597 return NULL;
12598 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599 }
12600
Victor Stinner9310abb2011-10-05 00:59:23 +020012601 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012602
12603 Py_DECREF(s);
12604 Py_XDECREF(sep);
12605 return result;
12606}
12607
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012608PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012609 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610\n\
12611Return a list of the words in S, using sep as the\n\
12612delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012613splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012614whitespace string is a separator and empty strings are\n\
12615removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616
12617static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012618unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012620 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012622 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012624 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12625 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626 return NULL;
12627
12628 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012629 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012631 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012633 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634}
12635
Thomas Wouters477c8d52006-05-27 19:21:47 +000012636PyObject *
12637PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12638{
12639 PyObject* str_obj;
12640 PyObject* sep_obj;
12641 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 int kind1, kind2, kind;
12643 void *buf1 = NULL, *buf2 = NULL;
12644 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012645
12646 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012647 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012648 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012649 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012650 if (!sep_obj) {
12651 Py_DECREF(str_obj);
12652 return NULL;
12653 }
12654 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12655 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012656 Py_DECREF(str_obj);
12657 return NULL;
12658 }
12659
Victor Stinner14f8f022011-10-05 20:58:25 +020012660 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 kind2 = PyUnicode_KIND(sep_obj);
Victor Stinner14f8f022011-10-05 20:58:25 +020012662 kind = Py_MAX(kind1, kind2);
12663 buf1 = PyUnicode_DATA(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 if (kind1 != kind)
Victor Stinner14f8f022011-10-05 20:58:25 +020012665 buf1 = _PyUnicode_AsKind(str_obj, kind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 if (!buf1)
12667 goto onError;
12668 buf2 = PyUnicode_DATA(sep_obj);
12669 if (kind2 != kind)
12670 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12671 if (!buf2)
12672 goto onError;
12673 len1 = PyUnicode_GET_LENGTH(str_obj);
12674 len2 = PyUnicode_GET_LENGTH(sep_obj);
12675
Benjamin Petersonead6b532011-12-20 17:23:42 -060012676 switch (PyUnicode_KIND(str_obj)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012678 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12679 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12680 else
12681 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 break;
12683 case PyUnicode_2BYTE_KIND:
12684 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12685 break;
12686 case PyUnicode_4BYTE_KIND:
12687 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12688 break;
12689 default:
12690 assert(0);
12691 out = 0;
12692 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012693
12694 Py_DECREF(sep_obj);
12695 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012696 if (kind1 != kind)
12697 PyMem_Free(buf1);
12698 if (kind2 != kind)
12699 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012700
12701 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012702 onError:
12703 Py_DECREF(sep_obj);
12704 Py_DECREF(str_obj);
12705 if (kind1 != kind && buf1)
12706 PyMem_Free(buf1);
12707 if (kind2 != kind && buf2)
12708 PyMem_Free(buf2);
12709 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012710}
12711
12712
12713PyObject *
12714PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12715{
12716 PyObject* str_obj;
12717 PyObject* sep_obj;
12718 PyObject* out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719 int kind1, kind2, kind;
12720 void *buf1 = NULL, *buf2 = NULL;
12721 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012722
12723 str_obj = PyUnicode_FromObject(str_in);
12724 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012725 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012726 sep_obj = PyUnicode_FromObject(sep_in);
12727 if (!sep_obj) {
12728 Py_DECREF(str_obj);
12729 return NULL;
12730 }
12731
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 kind1 = PyUnicode_KIND(str_in);
12733 kind2 = PyUnicode_KIND(sep_obj);
Georg Brandl4cb0de22011-09-28 21:49:49 +020012734 kind = Py_MAX(kind1, kind2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 buf1 = PyUnicode_DATA(str_in);
12736 if (kind1 != kind)
12737 buf1 = _PyUnicode_AsKind(str_in, kind);
12738 if (!buf1)
12739 goto onError;
12740 buf2 = PyUnicode_DATA(sep_obj);
12741 if (kind2 != kind)
12742 buf2 = _PyUnicode_AsKind(sep_obj, kind);
12743 if (!buf2)
12744 goto onError;
12745 len1 = PyUnicode_GET_LENGTH(str_obj);
12746 len2 = PyUnicode_GET_LENGTH(sep_obj);
12747
Benjamin Petersonead6b532011-12-20 17:23:42 -060012748 switch (PyUnicode_KIND(str_in)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012750 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12751 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12752 else
12753 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 break;
12755 case PyUnicode_2BYTE_KIND:
12756 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12757 break;
12758 case PyUnicode_4BYTE_KIND:
12759 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12760 break;
12761 default:
12762 assert(0);
12763 out = 0;
12764 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012765
12766 Py_DECREF(sep_obj);
12767 Py_DECREF(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012768 if (kind1 != kind)
12769 PyMem_Free(buf1);
12770 if (kind2 != kind)
12771 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012772
12773 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012774 onError:
12775 Py_DECREF(sep_obj);
12776 Py_DECREF(str_obj);
12777 if (kind1 != kind && buf1)
12778 PyMem_Free(buf1);
12779 if (kind2 != kind && buf2)
12780 PyMem_Free(buf2);
12781 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012782}
12783
12784PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012785 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012786\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012787Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012788the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012789found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012790
12791static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012792unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012793{
Victor Stinner9310abb2011-10-05 00:59:23 +020012794 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012795}
12796
12797PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012798 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012799\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012800Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012801the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012802separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012803
12804static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012805unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012806{
Victor Stinner9310abb2011-10-05 00:59:23 +020012807 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012808}
12809
Alexander Belopolsky40018472011-02-26 01:02:56 +000012810PyObject *
12811PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012812{
12813 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012814
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012815 s = PyUnicode_FromObject(s);
12816 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012817 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012818 if (sep != NULL) {
12819 sep = PyUnicode_FromObject(sep);
12820 if (sep == NULL) {
12821 Py_DECREF(s);
12822 return NULL;
12823 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012824 }
12825
Victor Stinner9310abb2011-10-05 00:59:23 +020012826 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012827
12828 Py_DECREF(s);
12829 Py_XDECREF(sep);
12830 return result;
12831}
12832
12833PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012834 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012835\n\
12836Return a list of the words in S, using sep as the\n\
12837delimiter string, starting at the end of the string and\n\
12838working to the front. If maxsplit is given, at most maxsplit\n\
12839splits are done. If sep is not specified, any whitespace string\n\
12840is a separator.");
12841
12842static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012843unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012844{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012845 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012846 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012847 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012848
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012849 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12850 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012851 return NULL;
12852
12853 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012854 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012855 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012856 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012857 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012858 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012859}
12860
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012861PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012862 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863\n\
12864Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012865Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012866is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012867
12868static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012869unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012871 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012872 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012874 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12875 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876 return NULL;
12877
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012878 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879}
12880
12881static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012882PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012884 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885}
12886
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012887PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012888 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889\n\
12890Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012891and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892
12893static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012894unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012896 if (PyUnicode_READY(self) == -1)
12897 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012898 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012899}
12900
Larry Hastings61272b72014-01-07 12:41:53 -080012901/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012902
Larry Hastings31826802013-10-19 00:09:25 -070012903@staticmethod
12904str.maketrans as unicode_maketrans
12905
12906 x: object
12907
12908 y: unicode=NULL
12909
12910 z: unicode=NULL
12911
12912 /
12913
12914Return a translation table usable for str.translate().
12915
12916If there is only one argument, it must be a dictionary mapping Unicode
12917ordinals (integers) or characters to Unicode ordinals, strings or None.
12918Character keys will be then converted to ordinals.
12919If there are two arguments, they must be strings of equal length, and
12920in the resulting dictionary, each character in x will be mapped to the
12921character at the same position in y. If there is a third argument, it
12922must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012923[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012924
12925PyDoc_STRVAR(unicode_maketrans__doc__,
Larry Hastings2623c8c2014-02-08 22:15:29 -080012926"maketrans(x, y=None, z=None, /)\n"
12927"--\n"
12928"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012929"Return a translation table usable for str.translate().\n"
12930"\n"
Larry Hastings31826802013-10-19 00:09:25 -070012931"If there is only one argument, it must be a dictionary mapping Unicode\n"
12932"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12933"Character keys will be then converted to ordinals.\n"
12934"If there are two arguments, they must be strings of equal length, and\n"
12935"in the resulting dictionary, each character in x will be mapped to the\n"
12936"character at the same position in y. If there is a third argument, it\n"
12937"must be a string, whose characters will be mapped to None in the result.");
12938
12939#define UNICODE_MAKETRANS_METHODDEF \
12940 {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12941
12942static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012943unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
Larry Hastings31826802013-10-19 00:09:25 -070012944
12945static PyObject *
Larry Hastingsebdcb502013-11-23 14:54:00 -080012946unicode_maketrans(void *null, PyObject *args)
Georg Brandlceee0772007-11-27 23:48:05 +000012947{
Larry Hastings31826802013-10-19 00:09:25 -070012948 PyObject *return_value = NULL;
12949 PyObject *x;
12950 PyObject *y = NULL;
12951 PyObject *z = NULL;
12952
12953 if (!PyArg_ParseTuple(args,
12954 "O|UU:maketrans",
12955 &x, &y, &z))
12956 goto exit;
Larry Hastings5c661892014-01-24 06:17:25 -080012957 return_value = unicode_maketrans_impl(x, y, z);
Larry Hastings31826802013-10-19 00:09:25 -070012958
12959exit:
12960 return return_value;
12961}
12962
12963static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012964unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Larry Hastings2623c8c2014-02-08 22:15:29 -080012965/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012966{
Georg Brandlceee0772007-11-27 23:48:05 +000012967 PyObject *new = NULL, *key, *value;
12968 Py_ssize_t i = 0;
12969 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012970
Georg Brandlceee0772007-11-27 23:48:05 +000012971 new = PyDict_New();
12972 if (!new)
12973 return NULL;
12974 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012975 int x_kind, y_kind, z_kind;
12976 void *x_data, *y_data, *z_data;
12977
Georg Brandlceee0772007-11-27 23:48:05 +000012978 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012979 if (!PyUnicode_Check(x)) {
12980 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12981 "be a string if there is a second argument");
12982 goto err;
12983 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012985 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12986 "arguments must have equal length");
12987 goto err;
12988 }
12989 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012990 x_kind = PyUnicode_KIND(x);
12991 y_kind = PyUnicode_KIND(y);
12992 x_data = PyUnicode_DATA(x);
12993 y_data = PyUnicode_DATA(y);
12994 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12995 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012996 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012997 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012998 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012999 if (!value) {
13000 Py_DECREF(key);
13001 goto err;
13002 }
Georg Brandlceee0772007-11-27 23:48:05 +000013003 res = PyDict_SetItem(new, key, value);
13004 Py_DECREF(key);
13005 Py_DECREF(value);
13006 if (res < 0)
13007 goto err;
13008 }
13009 /* create entries for deleting chars in z */
13010 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 z_kind = PyUnicode_KIND(z);
13012 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013013 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013015 if (!key)
13016 goto err;
13017 res = PyDict_SetItem(new, key, Py_None);
13018 Py_DECREF(key);
13019 if (res < 0)
13020 goto err;
13021 }
13022 }
13023 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 int kind;
13025 void *data;
13026
Georg Brandlceee0772007-11-27 23:48:05 +000013027 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013028 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013029 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13030 "to maketrans it must be a dict");
13031 goto err;
13032 }
13033 /* copy entries into the new dict, converting string keys to int keys */
13034 while (PyDict_Next(x, &i, &key, &value)) {
13035 if (PyUnicode_Check(key)) {
13036 /* convert string keys to integer keys */
13037 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013038 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013039 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13040 "table must be of length 1");
13041 goto err;
13042 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013043 kind = PyUnicode_KIND(key);
13044 data = PyUnicode_DATA(key);
13045 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013046 if (!newkey)
13047 goto err;
13048 res = PyDict_SetItem(new, newkey, value);
13049 Py_DECREF(newkey);
13050 if (res < 0)
13051 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013052 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013053 /* just keep integer keys */
13054 if (PyDict_SetItem(new, key, value) < 0)
13055 goto err;
13056 } else {
13057 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13058 "be strings or integers");
13059 goto err;
13060 }
13061 }
13062 }
13063 return new;
13064 err:
13065 Py_DECREF(new);
13066 return NULL;
13067}
13068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013069PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013070 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013071\n\
13072Return a copy of the string S, where all characters have been mapped\n\
13073through the given translation table, which must be a mapping of\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000013074Unicode ordinals to Unicode ordinals, strings, or None.\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +000013075Unmapped characters are left untouched. Characters mapped to None\n\
13076are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013077
13078static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013079unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013080{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013081 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013082}
13083
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013084PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013085 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013086\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013087Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013088
13089static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013090unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013092 if (PyUnicode_READY(self) == -1)
13093 return NULL;
13094 if (PyUnicode_IS_ASCII(self))
13095 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013096 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013097}
13098
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013099PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013100 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013102Pad a numeric string S with zeros on the left, to fill a field\n\
13103of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104
13105static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013106unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013108 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013109 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013110 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013111 int kind;
13112 void *data;
13113 Py_UCS4 chr;
13114
Martin v. Löwis18e16552006-02-15 17:27:45 +000013115 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116 return NULL;
13117
Benjamin Petersonbac79492012-01-14 13:34:47 -050013118 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013119 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120
Victor Stinnerc4b49542011-12-11 22:44:26 +010013121 if (PyUnicode_GET_LENGTH(self) >= width)
13122 return unicode_result_unchanged(self);
13123
13124 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125
13126 u = pad(self, fill, 0, '0');
13127
Walter Dörwald068325e2002-04-15 13:36:47 +000013128 if (u == NULL)
13129 return NULL;
13130
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013131 kind = PyUnicode_KIND(u);
13132 data = PyUnicode_DATA(u);
13133 chr = PyUnicode_READ(kind, data, fill);
13134
13135 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013137 PyUnicode_WRITE(kind, data, 0, chr);
13138 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139 }
13140
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013141 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013142 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013143}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013144
#if 0
/* Compiled out.  When enabled, simply forwards the receiver to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013153PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013154 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013155\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013156Return True if S starts with the specified prefix, False otherwise.\n\
13157With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013158With optional end, stop comparing S at that position.\n\
13159prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013160
13161static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013162unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013163 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013164{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013165 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013166 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013167 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013168 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013169 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013170
Jesus Ceaac451502011-04-20 17:09:23 +020013171 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013172 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013173 if (PyTuple_Check(subobj)) {
13174 Py_ssize_t i;
13175 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013176 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013177 if (substring == NULL)
13178 return NULL;
13179 result = tailmatch(self, substring, start, end, -1);
13180 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013181 if (result == -1)
13182 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013183 if (result) {
13184 Py_RETURN_TRUE;
13185 }
13186 }
13187 /* nothing matched */
13188 Py_RETURN_FALSE;
13189 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013190 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013191 if (substring == NULL) {
13192 if (PyErr_ExceptionMatches(PyExc_TypeError))
13193 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13194 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013195 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013196 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013197 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013198 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013199 if (result == -1)
13200 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013201 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013202}
13203
13204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013205PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013206 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013208Return True if S ends with the specified suffix, False otherwise.\n\
13209With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013210With optional end, stop comparing S at that position.\n\
13211suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013212
13213static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013214unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013215 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013216{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013217 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013218 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013219 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013220 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013221 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013222
Jesus Ceaac451502011-04-20 17:09:23 +020013223 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013224 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013225 if (PyTuple_Check(subobj)) {
13226 Py_ssize_t i;
13227 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013228 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013229 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013230 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013231 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013232 result = tailmatch(self, substring, start, end, +1);
13233 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013234 if (result == -1)
13235 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013236 if (result) {
13237 Py_RETURN_TRUE;
13238 }
13239 }
13240 Py_RETURN_FALSE;
13241 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013242 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013243 if (substring == NULL) {
13244 if (PyErr_ExceptionMatches(PyExc_TypeError))
13245 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13246 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013247 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013248 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013249 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013250 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013251 if (result == -1)
13252 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013253 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013254}
13255
Victor Stinner202fdca2012-05-07 12:47:02 +020013256Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013257_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013258{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013259 if (!writer->readonly)
13260 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13261 else {
13262 /* Copy-on-write mode: set buffer size to 0 so
13263 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13264 * next write. */
13265 writer->size = 0;
13266 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013267 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13268 writer->data = PyUnicode_DATA(writer->buffer);
13269 writer->kind = PyUnicode_KIND(writer->buffer);
13270}
13271
Victor Stinnerd3f08822012-05-29 12:57:52 +020013272void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013273_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013274{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013275 memset(writer, 0, sizeof(*writer));
13276#ifdef Py_DEBUG
13277 writer->kind = 5; /* invalid kind */
13278#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013279 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013280}
13281
Victor Stinnerd3f08822012-05-29 12:57:52 +020013282int
13283_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13284 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013285{
Victor Stinner6989ba02013-11-18 21:08:39 +010013286#ifdef MS_WINDOWS
13287 /* On Windows, overallocate by 50% is the best factor */
13288# define OVERALLOCATE_FACTOR 2
13289#else
13290 /* On Linux, overallocate by 25% is the best factor */
13291# define OVERALLOCATE_FACTOR 4
13292#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013293 Py_ssize_t newlen;
13294 PyObject *newbuffer;
13295
Victor Stinnerd3f08822012-05-29 12:57:52 +020013296 assert(length > 0);
13297
Victor Stinner202fdca2012-05-07 12:47:02 +020013298 if (length > PY_SSIZE_T_MAX - writer->pos) {
13299 PyErr_NoMemory();
13300 return -1;
13301 }
13302 newlen = writer->pos + length;
13303
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013304 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013305
Victor Stinnerd3f08822012-05-29 12:57:52 +020013306 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013307 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013308 if (writer->overallocate
13309 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13310 /* overallocate to limit the number of realloc() */
13311 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013312 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013313 if (newlen < writer->min_length)
13314 newlen = writer->min_length;
13315
Victor Stinnerd3f08822012-05-29 12:57:52 +020013316 writer->buffer = PyUnicode_New(newlen, maxchar);
13317 if (writer->buffer == NULL)
13318 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013319 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013320 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013321 if (writer->overallocate
13322 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13323 /* overallocate to limit the number of realloc() */
13324 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013325 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013326 if (newlen < writer->min_length)
13327 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013328
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013329 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013330 /* resize + widen */
13331 newbuffer = PyUnicode_New(newlen, maxchar);
13332 if (newbuffer == NULL)
13333 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013334 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13335 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013336 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013337 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013338 }
13339 else {
13340 newbuffer = resize_compact(writer->buffer, newlen);
13341 if (newbuffer == NULL)
13342 return -1;
13343 }
13344 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013345 }
13346 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013347 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013348 newbuffer = PyUnicode_New(writer->size, maxchar);
13349 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013350 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013351 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13352 writer->buffer, 0, writer->pos);
13353 Py_DECREF(writer->buffer);
13354 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013355 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013356 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013357 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013358
13359#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013360}
13361
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013362Py_LOCAL_INLINE(int)
13363_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013364{
13365 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13366 return -1;
13367 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13368 writer->pos++;
13369 return 0;
13370}
13371
13372int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013373_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13374{
13375 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13376}
13377
13378int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013379_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13380{
13381 Py_UCS4 maxchar;
13382 Py_ssize_t len;
13383
13384 if (PyUnicode_READY(str) == -1)
13385 return -1;
13386 len = PyUnicode_GET_LENGTH(str);
13387 if (len == 0)
13388 return 0;
13389 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13390 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013391 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013392 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013393 Py_INCREF(str);
13394 writer->buffer = str;
13395 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013396 writer->pos += len;
13397 return 0;
13398 }
13399 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13400 return -1;
13401 }
13402 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13403 str, 0, len);
13404 writer->pos += len;
13405 return 0;
13406}
13407
Victor Stinnere215d962012-10-06 23:03:36 +020013408int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013409_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13410 Py_ssize_t start, Py_ssize_t end)
13411{
13412 Py_UCS4 maxchar;
13413 Py_ssize_t len;
13414
13415 if (PyUnicode_READY(str) == -1)
13416 return -1;
13417
13418 assert(0 <= start);
13419 assert(end <= PyUnicode_GET_LENGTH(str));
13420 assert(start <= end);
13421
13422 if (end == 0)
13423 return 0;
13424
13425 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13426 return _PyUnicodeWriter_WriteStr(writer, str);
13427
13428 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13429 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13430 else
13431 maxchar = writer->maxchar;
13432 len = end - start;
13433
13434 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13435 return -1;
13436
13437 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13438 str, start, len);
13439 writer->pos += len;
13440 return 0;
13441}
13442
13443int
Victor Stinner4a587072013-11-19 12:54:53 +010013444_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13445 const char *ascii, Py_ssize_t len)
13446{
13447 if (len == -1)
13448 len = strlen(ascii);
13449
13450 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13451
13452 if (writer->buffer == NULL && !writer->overallocate) {
13453 PyObject *str;
13454
13455 str = _PyUnicode_FromASCII(ascii, len);
13456 if (str == NULL)
13457 return -1;
13458
13459 writer->readonly = 1;
13460 writer->buffer = str;
13461 _PyUnicodeWriter_Update(writer);
13462 writer->pos += len;
13463 return 0;
13464 }
13465
13466 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13467 return -1;
13468
13469 switch (writer->kind)
13470 {
13471 case PyUnicode_1BYTE_KIND:
13472 {
13473 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13474 Py_UCS1 *data = writer->data;
13475
13476 Py_MEMCPY(data + writer->pos, str, len);
13477 break;
13478 }
13479 case PyUnicode_2BYTE_KIND:
13480 {
13481 _PyUnicode_CONVERT_BYTES(
13482 Py_UCS1, Py_UCS2,
13483 ascii, ascii + len,
13484 (Py_UCS2 *)writer->data + writer->pos);
13485 break;
13486 }
13487 case PyUnicode_4BYTE_KIND:
13488 {
13489 _PyUnicode_CONVERT_BYTES(
13490 Py_UCS1, Py_UCS4,
13491 ascii, ascii + len,
13492 (Py_UCS4 *)writer->data + writer->pos);
13493 break;
13494 }
13495 default:
13496 assert(0);
13497 }
13498
13499 writer->pos += len;
13500 return 0;
13501}
13502
13503int
13504_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13505 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013506{
13507 Py_UCS4 maxchar;
13508
13509 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13510 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13511 return -1;
13512 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13513 writer->pos += len;
13514 return 0;
13515}
13516
Victor Stinnerd3f08822012-05-29 12:57:52 +020013517PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013518_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013519{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013520 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013521 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013522 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013523 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013524 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013525 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013526 str = writer->buffer;
13527 writer->buffer = NULL;
13528 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13529 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013530 }
13531 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13532 PyObject *newbuffer;
13533 newbuffer = resize_compact(writer->buffer, writer->pos);
13534 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013535 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013536 return NULL;
13537 }
13538 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013539 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013540 str = writer->buffer;
13541 writer->buffer = NULL;
13542 assert(_PyUnicode_CheckConsistency(str, 1));
13543 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013544}
13545
Victor Stinnerd3f08822012-05-29 12:57:52 +020013546void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013547_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013548{
13549 Py_CLEAR(writer->buffer);
13550}
13551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013552#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013553
13554PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013555 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013556\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013557Return a formatted version of S, using substitutions from args and kwargs.\n\
13558The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013559
Eric Smith27bbca62010-11-04 17:06:58 +000013560PyDoc_STRVAR(format_map__doc__,
13561 "S.format_map(mapping) -> str\n\
13562\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013563Return a formatted version of S, using substitutions from mapping.\n\
13564The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013565
Eric Smith4a7d76d2008-05-30 18:10:19 +000013566static PyObject *
13567unicode__format__(PyObject* self, PyObject* args)
13568{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013569 PyObject *format_spec;
13570 _PyUnicodeWriter writer;
13571 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013572
13573 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13574 return NULL;
13575
Victor Stinnerd3f08822012-05-29 12:57:52 +020013576 if (PyUnicode_READY(self) == -1)
13577 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013578 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013579 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13580 self, format_spec, 0,
13581 PyUnicode_GET_LENGTH(format_spec));
13582 if (ret == -1) {
13583 _PyUnicodeWriter_Dealloc(&writer);
13584 return NULL;
13585 }
13586 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013587}
13588
Eric Smith8c663262007-08-25 02:26:07 +000013589PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013590 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013591\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013592Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013593
13594static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013595unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013596{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013597 Py_ssize_t size;
13598
13599 /* If it's a compact object, account for base structure +
13600 character data. */
13601 if (PyUnicode_IS_COMPACT_ASCII(v))
13602 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13603 else if (PyUnicode_IS_COMPACT(v))
13604 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013605 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013606 else {
13607 /* If it is a two-block object, account for base object, and
13608 for character block if present. */
13609 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013610 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013611 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013612 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013613 }
13614 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013615 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013616 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013617 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013618 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013619 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013620
13621 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013622}
13623
13624PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013625 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013626
13627static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013628unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013629{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013630 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013631 if (!copy)
13632 return NULL;
13633 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013634}
13635
Guido van Rossumd57fd912000-03-10 22:53:23 +000013636static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013637 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013638 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013639 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13640 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013641 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13642 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013643 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013644 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13645 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13646 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013647 {"expandtabs", (PyCFunction) unicode_expandtabs,
13648 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013649 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013650 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013651 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13652 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13653 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013654 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013655 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13656 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13657 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013658 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013659 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013660 {"splitlines", (PyCFunction) unicode_splitlines,
13661 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013662 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013663 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13664 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13665 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13666 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13667 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13668 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13669 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13670 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13671 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13672 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13673 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13674 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13675 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13676 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013677 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013678 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013679 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013680 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013681 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013682 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013683 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013684 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013685#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013686 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013687 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013688#endif
13689
Benjamin Peterson14339b62009-01-31 16:36:08 +000013690 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013691 {NULL, NULL}
13692};
13693
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013694static PyObject *
13695unicode_mod(PyObject *v, PyObject *w)
13696{
Brian Curtindfc80e32011-08-10 20:28:54 -050013697 if (!PyUnicode_Check(v))
13698 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013699 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013700}
13701
13702static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013703 0, /*nb_add*/
13704 0, /*nb_subtract*/
13705 0, /*nb_multiply*/
13706 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013707};
13708
Guido van Rossumd57fd912000-03-10 22:53:23 +000013709static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013710 (lenfunc) unicode_length, /* sq_length */
13711 PyUnicode_Concat, /* sq_concat */
13712 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13713 (ssizeargfunc) unicode_getitem, /* sq_item */
13714 0, /* sq_slice */
13715 0, /* sq_ass_item */
13716 0, /* sq_ass_slice */
13717 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013718};
13719
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013720static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013721unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013722{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013723 if (PyUnicode_READY(self) == -1)
13724 return NULL;
13725
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013726 if (PyIndex_Check(item)) {
13727 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013728 if (i == -1 && PyErr_Occurred())
13729 return NULL;
13730 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013731 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013732 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013733 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013734 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013735 PyObject *result;
13736 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013737 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013738 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013739
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013740 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013741 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013742 return NULL;
13743 }
13744
13745 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013746 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013747 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013748 slicelength == PyUnicode_GET_LENGTH(self)) {
13749 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013750 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013751 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013752 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013753 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013754 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013755 src_kind = PyUnicode_KIND(self);
13756 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013757 if (!PyUnicode_IS_ASCII(self)) {
13758 kind_limit = kind_maxchar_limit(src_kind);
13759 max_char = 0;
13760 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13761 ch = PyUnicode_READ(src_kind, src_data, cur);
13762 if (ch > max_char) {
13763 max_char = ch;
13764 if (max_char >= kind_limit)
13765 break;
13766 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013767 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013768 }
Victor Stinner55c99112011-10-13 01:17:06 +020013769 else
13770 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013771 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013772 if (result == NULL)
13773 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013774 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013775 dest_data = PyUnicode_DATA(result);
13776
13777 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013778 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13779 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013780 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013781 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013782 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013783 } else {
13784 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13785 return NULL;
13786 }
13787}
13788
13789static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013790 (lenfunc)unicode_length, /* mp_length */
13791 (binaryfunc)unicode_subscript, /* mp_subscript */
13792 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013793};
13794
Guido van Rossumd57fd912000-03-10 22:53:23 +000013795
Guido van Rossumd57fd912000-03-10 22:53:23 +000013796/* Helpers for PyUnicode_Format() */
13797
Victor Stinnera47082312012-10-04 02:19:54 +020013798struct unicode_formatter_t {
13799 PyObject *args;
13800 int args_owned;
13801 Py_ssize_t arglen, argidx;
13802 PyObject *dict;
13803
13804 enum PyUnicode_Kind fmtkind;
13805 Py_ssize_t fmtcnt, fmtpos;
13806 void *fmtdata;
13807 PyObject *fmtstr;
13808
13809 _PyUnicodeWriter writer;
13810};
13811
13812struct unicode_format_arg_t {
13813 Py_UCS4 ch;
13814 int flags;
13815 Py_ssize_t width;
13816 int prec;
13817 int sign;
13818};
13819
Guido van Rossumd57fd912000-03-10 22:53:23 +000013820static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013821unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013822{
Victor Stinnera47082312012-10-04 02:19:54 +020013823 Py_ssize_t argidx = ctx->argidx;
13824
13825 if (argidx < ctx->arglen) {
13826 ctx->argidx++;
13827 if (ctx->arglen < 0)
13828 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013829 else
Victor Stinnera47082312012-10-04 02:19:54 +020013830 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013831 }
13832 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013833 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013834 return NULL;
13835}
13836
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013837/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013838
Victor Stinnera47082312012-10-04 02:19:54 +020013839/* Format a float into the writer if the writer is not NULL, or into *p_output
13840 otherwise.
13841
13842 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013843static int
Victor Stinnera47082312012-10-04 02:19:54 +020013844formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13845 PyObject **p_output,
13846 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013847{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013848 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013849 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013850 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013851 int prec;
13852 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013853
Guido van Rossumd57fd912000-03-10 22:53:23 +000013854 x = PyFloat_AsDouble(v);
13855 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013856 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013857
Victor Stinnera47082312012-10-04 02:19:54 +020013858 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013859 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013860 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013861
Victor Stinnera47082312012-10-04 02:19:54 +020013862 if (arg->flags & F_ALT)
13863 dtoa_flags = Py_DTSF_ALT;
13864 else
13865 dtoa_flags = 0;
13866 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013867 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013868 return -1;
13869 len = strlen(p);
13870 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013871 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013872 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013873 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013874 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013875 }
13876 else
13877 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013878 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013879 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013880}
13881
Victor Stinnerd0880d52012-04-27 23:40:13 +020013882/* formatlong() emulates the format codes d, u, o, x and X, and
13883 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13884 * Python's regular ints.
13885 * Return value: a new PyUnicodeObject*, or NULL if error.
13886 * The output string is of the form
13887 * "-"? ("0x" | "0X")? digit+
13888 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13889 * set in flags. The case of hex digits will be correct,
13890 * There will be at least prec digits, zero-filled on the left if
13891 * necessary to get that many.
13892 * val object to be converted
13893 * flags bitmask of format flags; only F_ALT is looked at
13894 * prec minimum number of digits; 0-fill on left if needed
13895 * type a character in [duoxX]; u acts the same as d
13896 *
13897 * CAUTION: o, x and X conversions on regular ints can never
13898 * produce a '-' sign, but can for Python's unbounded ints.
13899 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013900PyObject *
13901_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013902{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013903 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013904 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013905 Py_ssize_t i;
13906 int sign; /* 1 if '-', else 0 */
13907 int len; /* number of characters */
13908 Py_ssize_t llen;
13909 int numdigits; /* len == numnondigits + numdigits */
13910 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013911
Victor Stinnerd0880d52012-04-27 23:40:13 +020013912 /* Avoid exceeding SSIZE_T_MAX */
13913 if (prec > INT_MAX-3) {
13914 PyErr_SetString(PyExc_OverflowError,
13915 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013916 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013917 }
13918
13919 assert(PyLong_Check(val));
13920
13921 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013922 default:
13923 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013924 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013925 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013926 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013927 /* int and int subclasses should print numerically when a numeric */
13928 /* format code is used (see issue18780) */
13929 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013930 break;
13931 case 'o':
13932 numnondigits = 2;
13933 result = PyNumber_ToBase(val, 8);
13934 break;
13935 case 'x':
13936 case 'X':
13937 numnondigits = 2;
13938 result = PyNumber_ToBase(val, 16);
13939 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013940 }
13941 if (!result)
13942 return NULL;
13943
13944 assert(unicode_modifiable(result));
13945 assert(PyUnicode_IS_READY(result));
13946 assert(PyUnicode_IS_ASCII(result));
13947
13948 /* To modify the string in-place, there can only be one reference. */
13949 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013950 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013951 PyErr_BadInternalCall();
13952 return NULL;
13953 }
13954 buf = PyUnicode_DATA(result);
13955 llen = PyUnicode_GET_LENGTH(result);
13956 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013957 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013958 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080013959 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013960 return NULL;
13961 }
13962 len = (int)llen;
13963 sign = buf[0] == '-';
13964 numnondigits += sign;
13965 numdigits = len - numnondigits;
13966 assert(numdigits > 0);
13967
13968 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013969 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013970 (type == 'o' || type == 'x' || type == 'X'))) {
13971 assert(buf[sign] == '0');
13972 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13973 buf[sign+1] == 'o');
13974 numnondigits -= 2;
13975 buf += 2;
13976 len -= 2;
13977 if (sign)
13978 buf[0] = '-';
13979 assert(len == numnondigits + numdigits);
13980 assert(numdigits > 0);
13981 }
13982
13983 /* Fill with leading zeroes to meet minimum width. */
13984 if (prec > numdigits) {
13985 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13986 numnondigits + prec);
13987 char *b1;
13988 if (!r1) {
13989 Py_DECREF(result);
13990 return NULL;
13991 }
13992 b1 = PyBytes_AS_STRING(r1);
13993 for (i = 0; i < numnondigits; ++i)
13994 *b1++ = *buf++;
13995 for (i = 0; i < prec - numdigits; i++)
13996 *b1++ = '0';
13997 for (i = 0; i < numdigits; i++)
13998 *b1++ = *buf++;
13999 *b1 = '\0';
14000 Py_DECREF(result);
14001 result = r1;
14002 buf = PyBytes_AS_STRING(result);
14003 len = numnondigits + prec;
14004 }
14005
14006 /* Fix up case for hex conversions. */
14007 if (type == 'X') {
14008 /* Need to convert all lower case letters to upper case.
14009 and need to convert 0x to 0X (and -0x to -0X). */
14010 for (i = 0; i < len; i++)
14011 if (buf[i] >= 'a' && buf[i] <= 'x')
14012 buf[i] -= 'a'-'A';
14013 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014014 if (!PyUnicode_Check(result)
14015 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014016 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014017 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014018 Py_DECREF(result);
14019 result = unicode;
14020 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014021 else if (len != PyUnicode_GET_LENGTH(result)) {
14022 if (PyUnicode_Resize(&result, len) < 0)
14023 Py_CLEAR(result);
14024 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014025 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014026}
14027
Ethan Furmandf3ed242014-01-05 06:50:30 -080014028/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014029 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014030 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014031 * -1 and raise an exception on error */
14032static int
Victor Stinnera47082312012-10-04 02:19:54 +020014033mainformatlong(PyObject *v,
14034 struct unicode_format_arg_t *arg,
14035 PyObject **p_output,
14036 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014037{
14038 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014039 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014040
14041 if (!PyNumber_Check(v))
14042 goto wrongtype;
14043
Ethan Furman9ab74802014-03-21 06:38:46 -070014044 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014045 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014046 if (type == 'o' || type == 'x' || type == 'X') {
14047 iobj = PyNumber_Index(v);
14048 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014049 if (PyErr_ExceptionMatches(PyExc_TypeError))
14050 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014051 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014052 }
14053 }
14054 else {
14055 iobj = PyNumber_Long(v);
14056 if (iobj == NULL ) {
14057 if (PyErr_ExceptionMatches(PyExc_TypeError))
14058 goto wrongtype;
14059 return -1;
14060 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014061 }
14062 assert(PyLong_Check(iobj));
14063 }
14064 else {
14065 iobj = v;
14066 Py_INCREF(iobj);
14067 }
14068
14069 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014070 && arg->width == -1 && arg->prec == -1
14071 && !(arg->flags & (F_SIGN | F_BLANK))
14072 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014073 {
14074 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014075 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014076 int base;
14077
Victor Stinnera47082312012-10-04 02:19:54 +020014078 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014079 {
14080 default:
14081 assert(0 && "'type' not in [diuoxX]");
14082 case 'd':
14083 case 'i':
14084 case 'u':
14085 base = 10;
14086 break;
14087 case 'o':
14088 base = 8;
14089 break;
14090 case 'x':
14091 case 'X':
14092 base = 16;
14093 break;
14094 }
14095
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014096 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14097 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014098 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014099 }
14100 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014101 return 1;
14102 }
14103
Ethan Furmanb95b5612015-01-23 20:05:18 -080014104 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014105 Py_DECREF(iobj);
14106 if (res == NULL)
14107 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014108 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014109 return 0;
14110
14111wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014112 switch(type)
14113 {
14114 case 'o':
14115 case 'x':
14116 case 'X':
14117 PyErr_Format(PyExc_TypeError,
14118 "%%%c format: an integer is required, "
14119 "not %.200s",
14120 type, Py_TYPE(v)->tp_name);
14121 break;
14122 default:
14123 PyErr_Format(PyExc_TypeError,
14124 "%%%c format: a number is required, "
14125 "not %.200s",
14126 type, Py_TYPE(v)->tp_name);
14127 break;
14128 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014129 return -1;
14130}
14131
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014132static Py_UCS4
14133formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014134{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014135 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014136 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014137 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014138 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014139 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014140 goto onError;
14141 }
14142 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014143 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014144 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014145 /* make sure number is a type of integer */
14146 if (!PyLong_Check(v)) {
14147 iobj = PyNumber_Index(v);
14148 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014149 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014150 }
14151 v = iobj;
14152 Py_DECREF(iobj);
14153 }
14154 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014155 x = PyLong_AsLong(v);
14156 if (x == -1 && PyErr_Occurred())
14157 goto onError;
14158
Victor Stinner8faf8212011-12-08 22:14:11 +010014159 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014160 PyErr_SetString(PyExc_OverflowError,
14161 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014162 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014163 }
14164
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014165 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014166 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014167
Benjamin Peterson29060642009-01-31 22:14:21 +000014168 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014169 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014170 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014171 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014172}
14173
Victor Stinnera47082312012-10-04 02:19:54 +020014174/* Parse options of an argument: flags, width, precision.
14175 Handle also "%(name)" syntax.
14176
14177 Return 0 if the argument has been formatted into arg->str.
14178 Return 1 if the argument has been written into ctx->writer,
14179 Raise an exception and return -1 on error. */
14180static int
14181unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14182 struct unicode_format_arg_t *arg)
14183{
14184#define FORMAT_READ(ctx) \
14185 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14186
14187 PyObject *v;
14188
Victor Stinnera47082312012-10-04 02:19:54 +020014189 if (arg->ch == '(') {
14190 /* Get argument value from a dictionary. Example: "%(name)s". */
14191 Py_ssize_t keystart;
14192 Py_ssize_t keylen;
14193 PyObject *key;
14194 int pcount = 1;
14195
14196 if (ctx->dict == NULL) {
14197 PyErr_SetString(PyExc_TypeError,
14198 "format requires a mapping");
14199 return -1;
14200 }
14201 ++ctx->fmtpos;
14202 --ctx->fmtcnt;
14203 keystart = ctx->fmtpos;
14204 /* Skip over balanced parentheses */
14205 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14206 arg->ch = FORMAT_READ(ctx);
14207 if (arg->ch == ')')
14208 --pcount;
14209 else if (arg->ch == '(')
14210 ++pcount;
14211 ctx->fmtpos++;
14212 }
14213 keylen = ctx->fmtpos - keystart - 1;
14214 if (ctx->fmtcnt < 0 || pcount > 0) {
14215 PyErr_SetString(PyExc_ValueError,
14216 "incomplete format key");
14217 return -1;
14218 }
14219 key = PyUnicode_Substring(ctx->fmtstr,
14220 keystart, keystart + keylen);
14221 if (key == NULL)
14222 return -1;
14223 if (ctx->args_owned) {
14224 Py_DECREF(ctx->args);
14225 ctx->args_owned = 0;
14226 }
14227 ctx->args = PyObject_GetItem(ctx->dict, key);
14228 Py_DECREF(key);
14229 if (ctx->args == NULL)
14230 return -1;
14231 ctx->args_owned = 1;
14232 ctx->arglen = -1;
14233 ctx->argidx = -2;
14234 }
14235
14236 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014237 while (--ctx->fmtcnt >= 0) {
14238 arg->ch = FORMAT_READ(ctx);
14239 ctx->fmtpos++;
14240 switch (arg->ch) {
14241 case '-': arg->flags |= F_LJUST; continue;
14242 case '+': arg->flags |= F_SIGN; continue;
14243 case ' ': arg->flags |= F_BLANK; continue;
14244 case '#': arg->flags |= F_ALT; continue;
14245 case '0': arg->flags |= F_ZERO; continue;
14246 }
14247 break;
14248 }
14249
14250 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014251 if (arg->ch == '*') {
14252 v = unicode_format_getnextarg(ctx);
14253 if (v == NULL)
14254 return -1;
14255 if (!PyLong_Check(v)) {
14256 PyErr_SetString(PyExc_TypeError,
14257 "* wants int");
14258 return -1;
14259 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014260 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014261 if (arg->width == -1 && PyErr_Occurred())
14262 return -1;
14263 if (arg->width < 0) {
14264 arg->flags |= F_LJUST;
14265 arg->width = -arg->width;
14266 }
14267 if (--ctx->fmtcnt >= 0) {
14268 arg->ch = FORMAT_READ(ctx);
14269 ctx->fmtpos++;
14270 }
14271 }
14272 else if (arg->ch >= '0' && arg->ch <= '9') {
14273 arg->width = arg->ch - '0';
14274 while (--ctx->fmtcnt >= 0) {
14275 arg->ch = FORMAT_READ(ctx);
14276 ctx->fmtpos++;
14277 if (arg->ch < '0' || arg->ch > '9')
14278 break;
14279 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14280 mixing signed and unsigned comparison. Since arg->ch is between
14281 '0' and '9', casting to int is safe. */
14282 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14283 PyErr_SetString(PyExc_ValueError,
14284 "width too big");
14285 return -1;
14286 }
14287 arg->width = arg->width*10 + (arg->ch - '0');
14288 }
14289 }
14290
14291 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014292 if (arg->ch == '.') {
14293 arg->prec = 0;
14294 if (--ctx->fmtcnt >= 0) {
14295 arg->ch = FORMAT_READ(ctx);
14296 ctx->fmtpos++;
14297 }
14298 if (arg->ch == '*') {
14299 v = unicode_format_getnextarg(ctx);
14300 if (v == NULL)
14301 return -1;
14302 if (!PyLong_Check(v)) {
14303 PyErr_SetString(PyExc_TypeError,
14304 "* wants int");
14305 return -1;
14306 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014307 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014308 if (arg->prec == -1 && PyErr_Occurred())
14309 return -1;
14310 if (arg->prec < 0)
14311 arg->prec = 0;
14312 if (--ctx->fmtcnt >= 0) {
14313 arg->ch = FORMAT_READ(ctx);
14314 ctx->fmtpos++;
14315 }
14316 }
14317 else if (arg->ch >= '0' && arg->ch <= '9') {
14318 arg->prec = arg->ch - '0';
14319 while (--ctx->fmtcnt >= 0) {
14320 arg->ch = FORMAT_READ(ctx);
14321 ctx->fmtpos++;
14322 if (arg->ch < '0' || arg->ch > '9')
14323 break;
14324 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14325 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014326 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014327 return -1;
14328 }
14329 arg->prec = arg->prec*10 + (arg->ch - '0');
14330 }
14331 }
14332 }
14333
14334 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14335 if (ctx->fmtcnt >= 0) {
14336 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14337 if (--ctx->fmtcnt >= 0) {
14338 arg->ch = FORMAT_READ(ctx);
14339 ctx->fmtpos++;
14340 }
14341 }
14342 }
14343 if (ctx->fmtcnt < 0) {
14344 PyErr_SetString(PyExc_ValueError,
14345 "incomplete format");
14346 return -1;
14347 }
14348 return 0;
14349
14350#undef FORMAT_READ
14351}
14352
14353/* Format one argument. Supported conversion specifiers:
14354
14355 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014356 - "i", "d", "u": int or float
14357 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014358 - "e", "E", "f", "F", "g", "G": float
14359 - "c": int or str (1 character)
14360
Victor Stinner8dbd4212012-12-04 09:30:24 +010014361 When possible, the output is written directly into the Unicode writer
14362 (ctx->writer). A string is created when padding is required.
14363
Victor Stinnera47082312012-10-04 02:19:54 +020014364 Return 0 if the argument has been formatted into *p_str,
14365 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014366 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014367static int
14368unicode_format_arg_format(struct unicode_formatter_t *ctx,
14369 struct unicode_format_arg_t *arg,
14370 PyObject **p_str)
14371{
14372 PyObject *v;
14373 _PyUnicodeWriter *writer = &ctx->writer;
14374
14375 if (ctx->fmtcnt == 0)
14376 ctx->writer.overallocate = 0;
14377
14378 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014379 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014380 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014381 return 1;
14382 }
14383
14384 v = unicode_format_getnextarg(ctx);
14385 if (v == NULL)
14386 return -1;
14387
Victor Stinnera47082312012-10-04 02:19:54 +020014388
14389 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014390 case 's':
14391 case 'r':
14392 case 'a':
14393 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14394 /* Fast path */
14395 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14396 return -1;
14397 return 1;
14398 }
14399
14400 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14401 *p_str = v;
14402 Py_INCREF(*p_str);
14403 }
14404 else {
14405 if (arg->ch == 's')
14406 *p_str = PyObject_Str(v);
14407 else if (arg->ch == 'r')
14408 *p_str = PyObject_Repr(v);
14409 else
14410 *p_str = PyObject_ASCII(v);
14411 }
14412 break;
14413
14414 case 'i':
14415 case 'd':
14416 case 'u':
14417 case 'o':
14418 case 'x':
14419 case 'X':
14420 {
14421 int ret = mainformatlong(v, arg, p_str, writer);
14422 if (ret != 0)
14423 return ret;
14424 arg->sign = 1;
14425 break;
14426 }
14427
14428 case 'e':
14429 case 'E':
14430 case 'f':
14431 case 'F':
14432 case 'g':
14433 case 'G':
14434 if (arg->width == -1 && arg->prec == -1
14435 && !(arg->flags & (F_SIGN | F_BLANK)))
14436 {
14437 /* Fast path */
14438 if (formatfloat(v, arg, NULL, writer) == -1)
14439 return -1;
14440 return 1;
14441 }
14442
14443 arg->sign = 1;
14444 if (formatfloat(v, arg, p_str, NULL) == -1)
14445 return -1;
14446 break;
14447
14448 case 'c':
14449 {
14450 Py_UCS4 ch = formatchar(v);
14451 if (ch == (Py_UCS4) -1)
14452 return -1;
14453 if (arg->width == -1 && arg->prec == -1) {
14454 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014455 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014456 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014457 return 1;
14458 }
14459 *p_str = PyUnicode_FromOrdinal(ch);
14460 break;
14461 }
14462
14463 default:
14464 PyErr_Format(PyExc_ValueError,
14465 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014466 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014467 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14468 (int)arg->ch,
14469 ctx->fmtpos - 1);
14470 return -1;
14471 }
14472 if (*p_str == NULL)
14473 return -1;
14474 assert (PyUnicode_Check(*p_str));
14475 return 0;
14476}
14477
14478static int
14479unicode_format_arg_output(struct unicode_formatter_t *ctx,
14480 struct unicode_format_arg_t *arg,
14481 PyObject *str)
14482{
14483 Py_ssize_t len;
14484 enum PyUnicode_Kind kind;
14485 void *pbuf;
14486 Py_ssize_t pindex;
14487 Py_UCS4 signchar;
14488 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014489 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014490 Py_ssize_t sublen;
14491 _PyUnicodeWriter *writer = &ctx->writer;
14492 Py_UCS4 fill;
14493
14494 fill = ' ';
14495 if (arg->sign && arg->flags & F_ZERO)
14496 fill = '0';
14497
14498 if (PyUnicode_READY(str) == -1)
14499 return -1;
14500
14501 len = PyUnicode_GET_LENGTH(str);
14502 if ((arg->width == -1 || arg->width <= len)
14503 && (arg->prec == -1 || arg->prec >= len)
14504 && !(arg->flags & (F_SIGN | F_BLANK)))
14505 {
14506 /* Fast path */
14507 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14508 return -1;
14509 return 0;
14510 }
14511
14512 /* Truncate the string for "s", "r" and "a" formats
14513 if the precision is set */
14514 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14515 if (arg->prec >= 0 && len > arg->prec)
14516 len = arg->prec;
14517 }
14518
14519 /* Adjust sign and width */
14520 kind = PyUnicode_KIND(str);
14521 pbuf = PyUnicode_DATA(str);
14522 pindex = 0;
14523 signchar = '\0';
14524 if (arg->sign) {
14525 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14526 if (ch == '-' || ch == '+') {
14527 signchar = ch;
14528 len--;
14529 pindex++;
14530 }
14531 else if (arg->flags & F_SIGN)
14532 signchar = '+';
14533 else if (arg->flags & F_BLANK)
14534 signchar = ' ';
14535 else
14536 arg->sign = 0;
14537 }
14538 if (arg->width < len)
14539 arg->width = len;
14540
14541 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014542 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014543 if (!(arg->flags & F_LJUST)) {
14544 if (arg->sign) {
14545 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014546 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014547 }
14548 else {
14549 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014550 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014551 }
14552 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014553 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14554 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014555 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014556 }
14557
Victor Stinnera47082312012-10-04 02:19:54 +020014558 buflen = arg->width;
14559 if (arg->sign && len == arg->width)
14560 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014561 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014562 return -1;
14563
14564 /* Write the sign if needed */
14565 if (arg->sign) {
14566 if (fill != ' ') {
14567 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14568 writer->pos += 1;
14569 }
14570 if (arg->width > len)
14571 arg->width--;
14572 }
14573
14574 /* Write the numeric prefix for "x", "X" and "o" formats
14575 if the alternate form is used.
14576 For example, write "0x" for the "%#x" format. */
14577 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14578 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14579 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14580 if (fill != ' ') {
14581 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14582 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14583 writer->pos += 2;
14584 pindex += 2;
14585 }
14586 arg->width -= 2;
14587 if (arg->width < 0)
14588 arg->width = 0;
14589 len -= 2;
14590 }
14591
14592 /* Pad left with the fill character if needed */
14593 if (arg->width > len && !(arg->flags & F_LJUST)) {
14594 sublen = arg->width - len;
14595 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14596 writer->pos += sublen;
14597 arg->width = len;
14598 }
14599
14600 /* If padding with spaces: write sign if needed and/or numeric prefix if
14601 the alternate form is used */
14602 if (fill == ' ') {
14603 if (arg->sign) {
14604 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14605 writer->pos += 1;
14606 }
14607 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14608 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14609 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14610 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14611 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14612 writer->pos += 2;
14613 pindex += 2;
14614 }
14615 }
14616
14617 /* Write characters */
14618 if (len) {
14619 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14620 str, pindex, len);
14621 writer->pos += len;
14622 }
14623
14624 /* Pad right with the fill character if needed */
14625 if (arg->width > len) {
14626 sublen = arg->width - len;
14627 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14628 writer->pos += sublen;
14629 }
14630 return 0;
14631}
14632
14633/* Helper of PyUnicode_Format(): format one arg.
14634 Return 0 on success, raise an exception and return -1 on error. */
14635static int
14636unicode_format_arg(struct unicode_formatter_t *ctx)
14637{
14638 struct unicode_format_arg_t arg;
14639 PyObject *str;
14640 int ret;
14641
Victor Stinner8dbd4212012-12-04 09:30:24 +010014642 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14643 arg.flags = 0;
14644 arg.width = -1;
14645 arg.prec = -1;
14646 arg.sign = 0;
14647 str = NULL;
14648
Victor Stinnera47082312012-10-04 02:19:54 +020014649 ret = unicode_format_arg_parse(ctx, &arg);
14650 if (ret == -1)
14651 return -1;
14652
14653 ret = unicode_format_arg_format(ctx, &arg, &str);
14654 if (ret == -1)
14655 return -1;
14656
14657 if (ret != 1) {
14658 ret = unicode_format_arg_output(ctx, &arg, str);
14659 Py_DECREF(str);
14660 if (ret == -1)
14661 return -1;
14662 }
14663
14664 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14665 PyErr_SetString(PyExc_TypeError,
14666 "not all arguments converted during string formatting");
14667 return -1;
14668 }
14669 return 0;
14670}
14671
Alexander Belopolsky40018472011-02-26 01:02:56 +000014672PyObject *
14673PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014674{
Victor Stinnera47082312012-10-04 02:19:54 +020014675 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014676
Guido van Rossumd57fd912000-03-10 22:53:23 +000014677 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014678 PyErr_BadInternalCall();
14679 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014680 }
Victor Stinnera47082312012-10-04 02:19:54 +020014681
14682 ctx.fmtstr = PyUnicode_FromObject(format);
14683 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014684 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014685 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14686 Py_DECREF(ctx.fmtstr);
14687 return NULL;
14688 }
14689 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14690 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14691 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14692 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014693
Victor Stinner8f674cc2013-04-17 23:02:17 +020014694 _PyUnicodeWriter_Init(&ctx.writer);
14695 ctx.writer.min_length = ctx.fmtcnt + 100;
14696 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014697
Guido van Rossumd57fd912000-03-10 22:53:23 +000014698 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014699 ctx.arglen = PyTuple_Size(args);
14700 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014701 }
14702 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014703 ctx.arglen = -1;
14704 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014705 }
Victor Stinnera47082312012-10-04 02:19:54 +020014706 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014707 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014708 ctx.dict = args;
14709 else
14710 ctx.dict = NULL;
14711 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014712
Victor Stinnera47082312012-10-04 02:19:54 +020014713 while (--ctx.fmtcnt >= 0) {
14714 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014715 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014716
14717 nonfmtpos = ctx.fmtpos++;
14718 while (ctx.fmtcnt >= 0 &&
14719 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14720 ctx.fmtpos++;
14721 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014722 }
Victor Stinnera47082312012-10-04 02:19:54 +020014723 if (ctx.fmtcnt < 0) {
14724 ctx.fmtpos--;
14725 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014726 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014727
Victor Stinnercfc4c132013-04-03 01:48:39 +020014728 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14729 nonfmtpos, ctx.fmtpos) < 0)
14730 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014731 }
14732 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014733 ctx.fmtpos++;
14734 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014735 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014736 }
14737 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014738
Victor Stinnera47082312012-10-04 02:19:54 +020014739 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014740 PyErr_SetString(PyExc_TypeError,
14741 "not all arguments converted during string formatting");
14742 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014743 }
14744
Victor Stinnera47082312012-10-04 02:19:54 +020014745 if (ctx.args_owned) {
14746 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014747 }
Victor Stinnera47082312012-10-04 02:19:54 +020014748 Py_DECREF(ctx.fmtstr);
14749 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014750
Benjamin Peterson29060642009-01-31 22:14:21 +000014751 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014752 Py_DECREF(ctx.fmtstr);
14753 _PyUnicodeWriter_Dealloc(&ctx.writer);
14754 if (ctx.args_owned) {
14755 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014756 }
14757 return NULL;
14758}
14759
Jeremy Hylton938ace62002-07-17 16:30:39 +000014760static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014761unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14762
Tim Peters6d6c1a32001-08-02 04:15:00 +000014763static PyObject *
14764unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14765{
Benjamin Peterson29060642009-01-31 22:14:21 +000014766 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014767 static char *kwlist[] = {"object", "encoding", "errors", 0};
14768 char *encoding = NULL;
14769 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014770
Benjamin Peterson14339b62009-01-31 16:36:08 +000014771 if (type != &PyUnicode_Type)
14772 return unicode_subtype_new(type, args, kwds);
14773 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014774 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014775 return NULL;
14776 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014777 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014778 if (encoding == NULL && errors == NULL)
14779 return PyObject_Str(x);
14780 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014781 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014782}
14783
Guido van Rossume023fe02001-08-30 03:12:59 +000014784static PyObject *
14785unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14786{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014787 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014788 Py_ssize_t length, char_size;
14789 int share_wstr, share_utf8;
14790 unsigned int kind;
14791 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014792
Benjamin Peterson14339b62009-01-31 16:36:08 +000014793 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014794
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014795 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014796 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014797 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014798 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014799 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014800 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014801 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014802 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014803
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014804 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014805 if (self == NULL) {
14806 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014807 return NULL;
14808 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014809 kind = PyUnicode_KIND(unicode);
14810 length = PyUnicode_GET_LENGTH(unicode);
14811
14812 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014813#ifdef Py_DEBUG
14814 _PyUnicode_HASH(self) = -1;
14815#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014816 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014817#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014818 _PyUnicode_STATE(self).interned = 0;
14819 _PyUnicode_STATE(self).kind = kind;
14820 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014821 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014822 _PyUnicode_STATE(self).ready = 1;
14823 _PyUnicode_WSTR(self) = NULL;
14824 _PyUnicode_UTF8_LENGTH(self) = 0;
14825 _PyUnicode_UTF8(self) = NULL;
14826 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014827 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014828
14829 share_utf8 = 0;
14830 share_wstr = 0;
14831 if (kind == PyUnicode_1BYTE_KIND) {
14832 char_size = 1;
14833 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14834 share_utf8 = 1;
14835 }
14836 else if (kind == PyUnicode_2BYTE_KIND) {
14837 char_size = 2;
14838 if (sizeof(wchar_t) == 2)
14839 share_wstr = 1;
14840 }
14841 else {
14842 assert(kind == PyUnicode_4BYTE_KIND);
14843 char_size = 4;
14844 if (sizeof(wchar_t) == 4)
14845 share_wstr = 1;
14846 }
14847
14848 /* Ensure we won't overflow the length. */
14849 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14850 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014851 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014852 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014853 data = PyObject_MALLOC((length + 1) * char_size);
14854 if (data == NULL) {
14855 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014856 goto onError;
14857 }
14858
Victor Stinnerc3c74152011-10-02 20:39:55 +020014859 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014860 if (share_utf8) {
14861 _PyUnicode_UTF8_LENGTH(self) = length;
14862 _PyUnicode_UTF8(self) = data;
14863 }
14864 if (share_wstr) {
14865 _PyUnicode_WSTR_LENGTH(self) = length;
14866 _PyUnicode_WSTR(self) = (wchar_t *)data;
14867 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014868
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014869 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014870 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014871 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014872#ifdef Py_DEBUG
14873 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14874#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014875 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014876 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014877
14878onError:
14879 Py_DECREF(unicode);
14880 Py_DECREF(self);
14881 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014882}
14883
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014884PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014885"str(object='') -> str\n\
14886str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014887\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014888Create a new string object from the given object. If encoding or\n\
14889errors is specified, then the object must expose a data buffer\n\
14890that will be decoded using the given encoding and error handler.\n\
14891Otherwise, returns the result of object.__str__() (if defined)\n\
14892or repr(object).\n\
14893encoding defaults to sys.getdefaultencoding().\n\
14894errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014895
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014896static PyObject *unicode_iter(PyObject *seq);
14897
Guido van Rossumd57fd912000-03-10 22:53:23 +000014898PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014899 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014900 "str", /* tp_name */
14901 sizeof(PyUnicodeObject), /* tp_size */
14902 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014903 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014904 (destructor)unicode_dealloc, /* tp_dealloc */
14905 0, /* tp_print */
14906 0, /* tp_getattr */
14907 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014908 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014909 unicode_repr, /* tp_repr */
14910 &unicode_as_number, /* tp_as_number */
14911 &unicode_as_sequence, /* tp_as_sequence */
14912 &unicode_as_mapping, /* tp_as_mapping */
14913 (hashfunc) unicode_hash, /* tp_hash*/
14914 0, /* tp_call*/
14915 (reprfunc) unicode_str, /* tp_str */
14916 PyObject_GenericGetAttr, /* tp_getattro */
14917 0, /* tp_setattro */
14918 0, /* tp_as_buffer */
14919 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014920 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014921 unicode_doc, /* tp_doc */
14922 0, /* tp_traverse */
14923 0, /* tp_clear */
14924 PyUnicode_RichCompare, /* tp_richcompare */
14925 0, /* tp_weaklistoffset */
14926 unicode_iter, /* tp_iter */
14927 0, /* tp_iternext */
14928 unicode_methods, /* tp_methods */
14929 0, /* tp_members */
14930 0, /* tp_getset */
14931 &PyBaseObject_Type, /* tp_base */
14932 0, /* tp_dict */
14933 0, /* tp_descr_get */
14934 0, /* tp_descr_set */
14935 0, /* tp_dictoffset */
14936 0, /* tp_init */
14937 0, /* tp_alloc */
14938 unicode_new, /* tp_new */
14939 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014940};
14941
14942/* Initialize the Unicode implementation */
14943
Victor Stinner3a50e702011-10-18 21:21:00 +020014944int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014945{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014946 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014947 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014948 0x000A, /* LINE FEED */
14949 0x000D, /* CARRIAGE RETURN */
14950 0x001C, /* FILE SEPARATOR */
14951 0x001D, /* GROUP SEPARATOR */
14952 0x001E, /* RECORD SEPARATOR */
14953 0x0085, /* NEXT LINE */
14954 0x2028, /* LINE SEPARATOR */
14955 0x2029, /* PARAGRAPH SEPARATOR */
14956 };
14957
Fred Drakee4315f52000-05-09 19:53:39 +000014958 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014959 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014960 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014961 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014962 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014963
Guido van Rossumcacfc072002-05-24 19:01:59 +000014964 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014965 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014966
14967 /* initialize the linebreak bloom filter */
14968 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014969 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014970 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014971
Christian Heimes26532f72013-07-20 14:57:16 +020014972 if (PyType_Ready(&EncodingMapType) < 0)
14973 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014974
Benjamin Petersonc4311282012-10-30 23:21:10 -040014975 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14976 Py_FatalError("Can't initialize field name iterator type");
14977
14978 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14979 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014980
Victor Stinner3a50e702011-10-18 21:21:00 +020014981#ifdef HAVE_MBCS
14982 winver.dwOSVersionInfoSize = sizeof(winver);
14983 if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14984 PyErr_SetFromWindowsErr(0);
14985 return -1;
14986 }
14987#endif
14988 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014989}
14990
14991/* Finalize the Unicode implementation */
14992
/* Nothing is freed here; the function always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14998
Guido van Rossumd57fd912000-03-10 22:53:23 +000014999void
Thomas Wouters78890102000-07-22 19:25:51 +000015000_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015001{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000015002 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015003
Serhiy Storchaka05997252013-01-26 12:14:02 +020015004 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000015005
Serhiy Storchaka05997252013-01-26 12:14:02 +020015006 for (i = 0; i < 256; i++)
15007 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020015008 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000015009 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015010}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015011
Walter Dörwald16807132007-05-25 13:52:07 +000015012void
15013PyUnicode_InternInPlace(PyObject **p)
15014{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015015 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015016 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015017#ifdef Py_DEBUG
15018 assert(s != NULL);
15019 assert(_PyUnicode_CHECK(s));
15020#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015021 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015022 return;
15023#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015024 /* If it's a subclass, we don't really know what putting
15025 it in the interned dict might do. */
15026 if (!PyUnicode_CheckExact(s))
15027 return;
15028 if (PyUnicode_CHECK_INTERNED(s))
15029 return;
15030 if (interned == NULL) {
15031 interned = PyDict_New();
15032 if (interned == NULL) {
15033 PyErr_Clear(); /* Don't leave an exception */
15034 return;
15035 }
15036 }
15037 /* It might be that the GetItem call fails even
15038 though the key is present in the dictionary,
15039 namely when this happens during a stack overflow. */
15040 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015041 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015042 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015043
Victor Stinnerf0335102013-04-14 19:13:03 +020015044 if (t) {
15045 Py_INCREF(t);
15046 Py_DECREF(*p);
15047 *p = t;
15048 return;
15049 }
Walter Dörwald16807132007-05-25 13:52:07 +000015050
Benjamin Peterson14339b62009-01-31 16:36:08 +000015051 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015052 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015053 PyErr_Clear();
15054 PyThreadState_GET()->recursion_critical = 0;
15055 return;
15056 }
15057 PyThreadState_GET()->recursion_critical = 0;
15058 /* The two references in interned are not counted by refcnt.
15059 The deallocator will take care of this */
15060 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015061 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015062}
15063
15064void
15065PyUnicode_InternImmortal(PyObject **p)
15066{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015067 PyUnicode_InternInPlace(p);
15068 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015069 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015070 Py_INCREF(*p);
15071 }
Walter Dörwald16807132007-05-25 13:52:07 +000015072}
15073
15074PyObject *
15075PyUnicode_InternFromString(const char *cp)
15076{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015077 PyObject *s = PyUnicode_FromString(cp);
15078 if (s == NULL)
15079 return NULL;
15080 PyUnicode_InternInPlace(&s);
15081 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015082}
15083
Alexander Belopolsky40018472011-02-26 01:02:56 +000015084void
15085_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015086{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015087 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015088 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015089 Py_ssize_t i, n;
15090 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015091
Benjamin Peterson14339b62009-01-31 16:36:08 +000015092 if (interned == NULL || !PyDict_Check(interned))
15093 return;
15094 keys = PyDict_Keys(interned);
15095 if (keys == NULL || !PyList_Check(keys)) {
15096 PyErr_Clear();
15097 return;
15098 }
Walter Dörwald16807132007-05-25 13:52:07 +000015099
Benjamin Peterson14339b62009-01-31 16:36:08 +000015100 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15101 detector, interned unicode strings are not forcibly deallocated;
15102 rather, we give them their stolen references back, and then clear
15103 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015104
Benjamin Peterson14339b62009-01-31 16:36:08 +000015105 n = PyList_GET_SIZE(keys);
15106 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015107 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015108 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015109 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015110 if (PyUnicode_READY(s) == -1) {
15111 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015112 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015113 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015114 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015115 case SSTATE_NOT_INTERNED:
15116 /* XXX Shouldn't happen */
15117 break;
15118 case SSTATE_INTERNED_IMMORTAL:
15119 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015120 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015121 break;
15122 case SSTATE_INTERNED_MORTAL:
15123 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015124 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015125 break;
15126 default:
15127 Py_FatalError("Inconsistent interned string state.");
15128 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015129 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015130 }
15131 fprintf(stderr, "total size of all interned strings: "
15132 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15133 "mortal/immortal\n", mortal_size, immortal_size);
15134 Py_DECREF(keys);
15135 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015136 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015137}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015138
15139
15140/********************* Unicode Iterator **************************/
15141
15142typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015143 PyObject_HEAD
15144 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015145 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015146} unicodeiterobject;
15147
15148static void
15149unicodeiter_dealloc(unicodeiterobject *it)
15150{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015151 _PyObject_GC_UNTRACK(it);
15152 Py_XDECREF(it->it_seq);
15153 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015154}
15155
15156static int
15157unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15158{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015159 Py_VISIT(it->it_seq);
15160 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015161}
15162
15163static PyObject *
15164unicodeiter_next(unicodeiterobject *it)
15165{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015166 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015167
Benjamin Peterson14339b62009-01-31 16:36:08 +000015168 assert(it != NULL);
15169 seq = it->it_seq;
15170 if (seq == NULL)
15171 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015172 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015174 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15175 int kind = PyUnicode_KIND(seq);
15176 void *data = PyUnicode_DATA(seq);
15177 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15178 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015179 if (item != NULL)
15180 ++it->it_index;
15181 return item;
15182 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015183
Benjamin Peterson14339b62009-01-31 16:36:08 +000015184 Py_DECREF(seq);
15185 it->it_seq = NULL;
15186 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015187}
15188
15189static PyObject *
15190unicodeiter_len(unicodeiterobject *it)
15191{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015192 Py_ssize_t len = 0;
15193 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015194 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015195 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015196}
15197
15198PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15199
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015200static PyObject *
15201unicodeiter_reduce(unicodeiterobject *it)
15202{
15203 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015204 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015205 it->it_seq, it->it_index);
15206 } else {
15207 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15208 if (u == NULL)
15209 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015210 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015211 }
15212}
15213
15214PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15215
15216static PyObject *
15217unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15218{
15219 Py_ssize_t index = PyLong_AsSsize_t(state);
15220 if (index == -1 && PyErr_Occurred())
15221 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015222 if (it->it_seq != NULL) {
15223 if (index < 0)
15224 index = 0;
15225 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15226 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15227 it->it_index = index;
15228 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015229 Py_RETURN_NONE;
15230}
15231
15232PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15233
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015234static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015235 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015236 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015237 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15238 reduce_doc},
15239 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15240 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015241 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015242};
15243
15244PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015245 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15246 "str_iterator", /* tp_name */
15247 sizeof(unicodeiterobject), /* tp_basicsize */
15248 0, /* tp_itemsize */
15249 /* methods */
15250 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15251 0, /* tp_print */
15252 0, /* tp_getattr */
15253 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015254 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015255 0, /* tp_repr */
15256 0, /* tp_as_number */
15257 0, /* tp_as_sequence */
15258 0, /* tp_as_mapping */
15259 0, /* tp_hash */
15260 0, /* tp_call */
15261 0, /* tp_str */
15262 PyObject_GenericGetAttr, /* tp_getattro */
15263 0, /* tp_setattro */
15264 0, /* tp_as_buffer */
15265 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15266 0, /* tp_doc */
15267 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15268 0, /* tp_clear */
15269 0, /* tp_richcompare */
15270 0, /* tp_weaklistoffset */
15271 PyObject_SelfIter, /* tp_iter */
15272 (iternextfunc)unicodeiter_next, /* tp_iternext */
15273 unicodeiter_methods, /* tp_methods */
15274 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015275};
15276
15277static PyObject *
15278unicode_iter(PyObject *seq)
15279{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015280 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015281
Benjamin Peterson14339b62009-01-31 16:36:08 +000015282 if (!PyUnicode_Check(seq)) {
15283 PyErr_BadInternalCall();
15284 return NULL;
15285 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015286 if (PyUnicode_READY(seq) == -1)
15287 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015288 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15289 if (it == NULL)
15290 return NULL;
15291 it->it_index = 0;
15292 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015293 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015294 _PyObject_GC_TRACK(it);
15295 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015296}
15297
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015298
15299size_t
15300Py_UNICODE_strlen(const Py_UNICODE *u)
15301{
15302 int res = 0;
15303 while(*u++)
15304 res++;
15305 return res;
15306}
15307
15308Py_UNICODE*
15309Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15310{
15311 Py_UNICODE *u = s1;
15312 while ((*u++ = *s2++));
15313 return s1;
15314}
15315
15316Py_UNICODE*
15317Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15318{
15319 Py_UNICODE *u = s1;
15320 while ((*u++ = *s2++))
15321 if (n-- == 0)
15322 break;
15323 return s1;
15324}
15325
15326Py_UNICODE*
15327Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15328{
15329 Py_UNICODE *u1 = s1;
15330 u1 += Py_UNICODE_strlen(u1);
15331 Py_UNICODE_strcpy(u1, s2);
15332 return s1;
15333}
15334
15335int
15336Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15337{
15338 while (*s1 && *s2 && *s1 == *s2)
15339 s1++, s2++;
15340 if (*s1 && *s2)
15341 return (*s1 < *s2) ? -1 : +1;
15342 if (*s1)
15343 return 1;
15344 if (*s2)
15345 return -1;
15346 return 0;
15347}
15348
15349int
15350Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15351{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015352 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015353 for (; n != 0; n--) {
15354 u1 = *s1;
15355 u2 = *s2;
15356 if (u1 != u2)
15357 return (u1 < u2) ? -1 : +1;
15358 if (u1 == '\0')
15359 return 0;
15360 s1++;
15361 s2++;
15362 }
15363 return 0;
15364}
15365
15366Py_UNICODE*
15367Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15368{
15369 const Py_UNICODE *p;
15370 for (p = s; *p; p++)
15371 if (*p == c)
15372 return (Py_UNICODE*)p;
15373 return NULL;
15374}
15375
15376Py_UNICODE*
15377Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15378{
15379 const Py_UNICODE *p;
15380 p = s + Py_UNICODE_strlen(s);
15381 while (p != s) {
15382 p--;
15383 if (*p == c)
15384 return (Py_UNICODE*)p;
15385 }
15386 return NULL;
15387}
Victor Stinner331ea922010-08-10 16:37:20 +000015388
Victor Stinner71133ff2010-09-01 23:43:53 +000015389Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015390PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015391{
Victor Stinner577db2c2011-10-11 22:12:48 +020015392 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015393 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015395 if (!PyUnicode_Check(unicode)) {
15396 PyErr_BadArgument();
15397 return NULL;
15398 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015399 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015400 if (u == NULL)
15401 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015402 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015403 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015404 PyErr_NoMemory();
15405 return NULL;
15406 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015407 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015408 size *= sizeof(Py_UNICODE);
15409 copy = PyMem_Malloc(size);
15410 if (copy == NULL) {
15411 PyErr_NoMemory();
15412 return NULL;
15413 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015414 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015415 return copy;
15416}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015417
Georg Brandl66c221e2010-10-14 07:04:07 +000015418/* A _string module, to export formatter_parser and formatter_field_name_split
15419 to the string.Formatter class implemented in Python. */
15420
15421static PyMethodDef _string_methods[] = {
15422 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15423 METH_O, PyDoc_STR("split the argument as a field name")},
15424 {"formatter_parser", (PyCFunction) formatter_parser,
15425 METH_O, PyDoc_STR("parse the argument as a format string")},
15426 {NULL, NULL}
15427};
15428
15429static struct PyModuleDef _string_module = {
15430 PyModuleDef_HEAD_INIT,
15431 "_string",
15432 PyDoc_STR("string helper module"),
15433 0,
15434 _string_methods,
15435 NULL,
15436 NULL,
15437 NULL,
15438 NULL
15439};
15440
15441PyMODINIT_FUNC
15442PyInit__string(void)
15443{
15444 return PyModule_Create(&_string_module);
15445}
15446
15447
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015448#ifdef __cplusplus
15449}
15450#endif