blob: 168f9f992391af93ec806d46815174d8fa5686ad [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
Larry Hastings61272b72014-01-07 12:41:53 -080050/*[clinic input]
Larry Hastingsc2047262014-01-25 20:43:29 -080051class str "PyUnicodeObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080052[clinic start generated code]*/
Larry Hastings581ee362014-01-28 05:00:08 -080053/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080054
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000055/* --- Globals ------------------------------------------------------------
56
Serhiy Storchaka05997252013-01-26 12:14:02 +020057NOTE: In the interpreter's initialization phase, some globals are currently
58 initialized dynamically as needed. In the process Unicode objects may
59 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000060
61*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000063
64#ifdef __cplusplus
65extern "C" {
66#endif
67
Victor Stinner8faf8212011-12-08 22:14:11 +010068/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
69#define MAX_UNICODE 0x10ffff
70
/* Object validation used by the assert()s in this file: release builds do a
   plain type check, debug builds run the structural consistency checker
   with content checking switched off. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +020076
/* Raw access to the utf8 / utf8_length fields of PyCompactUnicodeObject.
   Both expand to the field itself, so they may be assigned to. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Checked readers.  For a compact ASCII object the answer is taken from the
   PyASCIIObject itself (the bytes directly after the struct, and its length
   field); otherwise the utf8 / utf8_length fields are read. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((char *)((PyASCIIObject *)(op) + 1))            \
         : _PyUnicode_UTF8(op))
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((PyASCIIObject *)(op))->length                  \
         : _PyUnicode_UTF8_LENGTH(op))
/* Unchecked field accessors.  Each casts `op` to the struct type named in
   the cast and expands to the field itself, so the result can be used as an
   assignment target. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)

/* Checked readers: assert _PyUnicode_CHECK(op), then read the field. */
#define _PyUnicode_KIND(op)                         \
    (assert(_PyUnicode_CHECK(op)),                  \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                   \
    (assert(_PyUnicode_CHECK(op)),                  \
     ((PyASCIIObject *)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200111
/* File-local replacement for PyUnicode_READY: asserts _PyUnicode_CHECK(op),
   then evaluates to 0 when the object is already ready and otherwise to the
   result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                         \
    (assert(_PyUnicode_CHECK(op)),                  \
     (!PyUnicode_IS_READY(op)                       \
          ? _PyUnicode_Ready(op)                    \
          : 0))
Victor Stinner910337b2011-10-03 03:20:16 +0200118
/* True when the utf8 pointer of `op` is the same address as its character
   data.  Asserts that `op` is not a compact ASCII object. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (PyUnicode_DATA(op) == _PyUnicode_UTF8(op)))
/* True when the wstr pointer of `op` is the same address as its character
   data.  Both sides are taken from the macro argument `op`; the macro does
   not depend on any identifier in the caller's scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object owns a separate UTF-8 block: it is not compact
   ASCII, its utf8 pointer is set, and that pointer is not the character
   data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                      \
    (!PyUnicode_IS_COMPACT_ASCII(op) &&                     \
     _PyUnicode_UTF8(op) != NULL &&                         \
     _PyUnicode_UTF8(op) != PyUnicode_DATA(op))
133
/* True if the Unicode object owns a separate wstr block: the wstr pointer
   is set and either the object is not ready yet or wstr is not the
   character data itself.  The readiness test comes first so that
   PyUnicode_DATA() is only evaluated for ready objects. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                      \
    (_PyUnicode_WSTR(op) != NULL &&                         \
     (!PyUnicode_IS_READY(op) ||                            \
      _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
140
/* Element-wise copy between character buffers of different widths.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters (end is exclusive) and are treated as
   "from_type *"; to is treated as "to_type *" and receives one converted
   character per source character.  Every character is converted with a
   plain cast to to_type.

   The bulk of the input is handled four characters per iteration; the
   remaining 0-3 characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_to = (to_type *)(to);                              \
        const from_type *_iter = (from_type *)(begin);               \
        const from_type *_end = (from_type *)(end);                  \
        Py_ssize_t _count = _end - _iter;                            \
        const from_type *_bulk_end =                                 \
            _iter + _Py_SIZE_ROUND_DOWN(_count, 4);                  \
        for (; _iter < _bulk_end; _iter += 4, _to += 4) {            \
            _to[0] = (to_type) _iter[0];                             \
            _to[1] = (to_type) _iter[1];                             \
            _to[2] = (to_type) _iter[2];                             \
            _to[3] = (to_type) _iter[3];                             \
        }                                                            \
        for (; _iter < _end; _iter++, _to++)                         \
            *_to = (to_type) *_iter;                                 \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200164
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
/* Take a new reference to the shared empty string.  If the singleton does
   not exist yet it is created with PyUnicode_New(0, 0) and then referenced
   once more for the caller.  When that creation fails, unicode_empty is
   left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return the shared empty string from the enclosing function, after taking
   a reference with _Py_INCREF_UNICODE_EMPTY().  The returned value is
   whatever unicode_empty holds at that point. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters.
   Indexed by code point 0x00-0x7F; an entry is 1 for whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x08 */
    0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    1, 1, 1, 1, 1,
    /* 0x0E - 0x1B */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    1, 1, 1, 1,
    /* 0x20 SPACE */
    1,
    /* 0x21 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
238
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Fast detection of line breaks in the ASCII range.
   Indexed by code point 0x00-0x7F; an entry is 1 for a line break character,
   0 otherwise.  The table is never written, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x09 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    1, 1, 1, 1,
    /* 0x0E - 0x1B */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    1, 1, 1,
    /* 0x1F */
    0,
    /* 0x20 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
292
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300293#include "clinic/unicodeobject.c.h"
294
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300295/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
296 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000298PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000300#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000303 /* This is actually an illegal character, so it should
304 not be passed to unichr. */
305 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#endif
307}
308
#ifdef Py_DEBUG
/* Debug-build checker for the internal layout of a Unicode object.

   op            -- object to inspect; must pass PyUnicode_Check().
   check_content -- when non-zero (and the string is not in the wchar_t-only
                    state) also scan the characters to verify that the
                    narrowest suitable kind is used and that the buffer is
                    null-terminated.

   Every violation trips an assert().  The function itself always returns 1
   so that it can be wrapped in assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (!(base->state.ascii == 1 && base->state.compact == 1)) {
        /* Everything except compact ASCII: the compact-struct fields
           (utf8, utf8_length, wstr_length) are inspected as well. */
        PyCompactUnicodeObject *cu = (PyCompactUnicodeObject *)op;
        void *payload;
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

        if (base->state.compact != 1) {
            PyUnicodeObject *legacy = (PyUnicodeObject *)op;

            payload = legacy->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                /* non-compact string that has been made ready */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(base->state.compact == 0);
                assert(base->state.ready == 1);
                assert(payload != NULL);
                if (base->state.ascii) {
                    assert (cu->utf8 == payload);
                    assert (cu->utf8_length == base->length);
                }
                else
                    assert (cu->utf8 != payload);
            }
            else {
                /* not ready: only the wstr representation exists */
                assert(base->length == 0);
                assert(base->hash == -1);
                assert(base->state.compact == 0);
                assert(base->state.ascii == 0);
                assert(base->state.ready == 0);
                assert(base->state.interned == SSTATE_NOT_INTERNED);
                assert(base->wstr != NULL);
                assert(payload == NULL);
                assert(cu->utf8 == NULL);
            }
        }
        else {
            /* compact, non-ASCII: characters follow the compact struct */
            payload = cu + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert (cu->utf8 != payload);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the
               size of wchar_t */
            if (kind == wchar_sized_kind) {
                assert(base->wstr == payload);
                assert(cu->wstr_length == base->length);
            }
            else
                assert(base->wstr != payload);
        }

        if (cu->utf8 == NULL)
            assert(cu->utf8_length == 0);
        if (base->wstr == NULL)
            assert(cu->wstr_length == 0);
    }
    else {
        /* compact ASCII */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 highest = 0;
        void *chars;
        Py_UCS4 ch;

        chars = PyUnicode_DATA(base);
        for (pos = 0; pos < base->length; pos++) {
            ch = PyUnicode_READ(kind, chars, pos);
            if (highest < ch)
                highest = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (base->state.ascii == 0) {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            else
                assert(highest < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }
        /* the character buffer must end with a null character */
        assert(PyUnicode_READ(kind, chars, base->length) == 0);
    }
    return 1;
}
#endif
424
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425static PyObject*
426unicode_result_wchar(PyObject *unicode)
427{
428#ifndef Py_DEBUG
429 Py_ssize_t len;
430
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 len = _PyUnicode_WSTR_LENGTH(unicode);
432 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200434 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100435 }
436
437 if (len == 1) {
438 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100439 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100440 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
441 Py_DECREF(unicode);
442 return latin1_char;
443 }
444 }
445
446 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200447 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100448 return NULL;
449 }
450#else
Victor Stinneraa771272012-10-04 02:32:58 +0200451 assert(Py_REFCNT(unicode) == 1);
452
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100453 /* don't make the result ready in debug mode to ensure that the caller
454 makes the string ready before using it */
455 assert(_PyUnicode_CheckConsistency(unicode, 1));
456#endif
457 return unicode;
458}
459
460static PyObject*
461unicode_result_ready(PyObject *unicode)
462{
463 Py_ssize_t length;
464
465 length = PyUnicode_GET_LENGTH(unicode);
466 if (length == 0) {
467 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200469 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100470 }
471 return unicode_empty;
472 }
473
474 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200475 void *data = PyUnicode_DATA(unicode);
476 int kind = PyUnicode_KIND(unicode);
477 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100478 if (ch < 256) {
479 PyObject *latin1_char = unicode_latin1[ch];
480 if (latin1_char != NULL) {
481 if (unicode != latin1_char) {
482 Py_INCREF(latin1_char);
483 Py_DECREF(unicode);
484 }
485 return latin1_char;
486 }
487 else {
488 assert(_PyUnicode_CheckConsistency(unicode, 1));
489 Py_INCREF(unicode);
490 unicode_latin1[ch] = unicode;
491 return unicode;
492 }
493 }
494 }
495
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497 return unicode;
498}
499
500static PyObject*
501unicode_result(PyObject *unicode)
502{
503 assert(_PyUnicode_CHECK(unicode));
504 if (PyUnicode_IS_READY(unicode))
505 return unicode_result_ready(unicode);
506 else
507 return unicode_result_wchar(unicode);
508}
509
Victor Stinnerc4b49542011-12-11 22:44:26 +0100510static PyObject*
511unicode_result_unchanged(PyObject *unicode)
512{
513 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500514 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515 return NULL;
516 Py_INCREF(unicode);
517 return unicode;
518 }
519 else
520 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100521 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100522}
523
Thomas Wouters477c8d52006-05-27 19:21:47 +0000524/* --- Bloom Filters ----------------------------------------------------- */
525
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (ch & (BLOOM_WIDTH - 1), i.e. 5 to 7
   bits depending on BLOOM_WIDTH) as the bit index. */
529
530/* the linebreak mask is set up by Unicode_Init below */
531
Antoine Pitrouf068f942010-01-13 14:19:12 +0000532#if LONG_BIT >= 128
533#define BLOOM_WIDTH 128
534#elif LONG_BIT >= 64
535#define BLOOM_WIDTH 64
536#elif LONG_BIT >= 32
537#define BLOOM_WIDTH 32
538#else
539#error "LONG_BIT is smaller than 32"
540#endif
541
/* Type of a bloom bitmask. */
#define BLOOM_MASK unsigned long

/* Mask for line break characters; starts with every bit set. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   Both arguments are parenthesized so that arbitrary expressions can be
   passed for them. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: code points below 128 are looked up in ascii_linebreak;
   others must pass the bloom_linebreak filter before
   Py_UNICODE_ISLINEBREAK() is consulted. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Alexander Belopolsky40018472011-02-26 01:02:56 +0000552Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554{
Victor Stinnera85af502013-04-09 21:53:54 +0200555#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
556 do { \
557 TYPE *data = (TYPE *)PTR; \
558 TYPE *end = data + LEN; \
559 Py_UCS4 ch; \
560 for (; data != end; data++) { \
561 ch = *data; \
562 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
563 } \
564 break; \
565 } while (0)
566
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567 /* calculate simple bloom-style bitmask for a given unicode string */
568
Antoine Pitrouf068f942010-01-13 14:19:12 +0000569 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000570
571 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200572 switch (kind) {
573 case PyUnicode_1BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
575 break;
576 case PyUnicode_2BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
578 break;
579 case PyUnicode_4BYTE_KIND:
580 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
581 break;
582 default:
583 assert(0);
584 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200586
587#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000588}
589
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200590/* Compilation of templated routines */
591
592#include "stringlib/asciilib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
602#include "stringlib/ucs1lib.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/partition.h"
605#include "stringlib/split.h"
606#include "stringlib/count.h"
607#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300608#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200609#include "stringlib/find_max_char.h"
610#include "stringlib/localeutil.h"
611#include "stringlib/undef.h"
612
613#include "stringlib/ucs2lib.h"
614#include "stringlib/fastsearch.h"
615#include "stringlib/partition.h"
616#include "stringlib/split.h"
617#include "stringlib/count.h"
618#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300619#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200620#include "stringlib/find_max_char.h"
621#include "stringlib/localeutil.h"
622#include "stringlib/undef.h"
623
624#include "stringlib/ucs4lib.h"
625#include "stringlib/fastsearch.h"
626#include "stringlib/partition.h"
627#include "stringlib/split.h"
628#include "stringlib/count.h"
629#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300630#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200631#include "stringlib/find_max_char.h"
632#include "stringlib/localeutil.h"
633#include "stringlib/undef.h"
634
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200635#include "stringlib/unicodedefs.h"
636#include "stringlib/fastsearch.h"
637#include "stringlib/count.h"
638#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100639#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200640
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641/* --- Unicode Object ----------------------------------------------------- */
642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200644fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200646Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647 Py_ssize_t size, Py_UCS4 ch,
648 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200650 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
651
652 switch (kind) {
653 case PyUnicode_1BYTE_KIND:
654 {
655 Py_UCS1 ch1 = (Py_UCS1) ch;
656 if (ch1 == ch)
657 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
658 else
659 return -1;
660 }
661 case PyUnicode_2BYTE_KIND:
662 {
663 Py_UCS2 ch2 = (Py_UCS2) ch;
664 if (ch2 == ch)
665 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
666 else
667 return -1;
668 }
669 case PyUnicode_4BYTE_KIND:
670 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
671 default:
672 assert(0);
673 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200675}
676
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, length) of a
   string with 0xff bytes so that code forgetting to initialize them is
   caught early.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Nothing is written when the current length does not exceed old_length. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_width = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* Offsets and sizes are in bytes: characters times width. */
        memset(bytes + old_length * char_width, 0xff,
               (new_length - old_length) * char_width);
    }
}
#endif
695
Victor Stinnerfe226c02011-10-03 03:52:20 +0200696static PyObject*
697resize_compact(PyObject *unicode, Py_ssize_t length)
698{
699 Py_ssize_t char_size;
700 Py_ssize_t struct_size;
701 Py_ssize_t new_size;
702 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100703 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200704#ifdef Py_DEBUG
705 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
706#endif
707
Victor Stinner79891572012-05-03 13:43:07 +0200708 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100710 assert(PyUnicode_IS_COMPACT(unicode));
711
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200712 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200714 struct_size = sizeof(PyASCIIObject);
715 else
716 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200717 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
720 PyErr_NoMemory();
721 return NULL;
722 }
723 new_size = (struct_size + (length + 1) * char_size);
724
Victor Stinner84def372011-12-11 20:04:56 +0100725 _Py_DEC_REFTOTAL;
726 _Py_ForgetReference(unicode);
727
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300728 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100729 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100730 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200731 PyErr_NoMemory();
732 return NULL;
733 }
Victor Stinner84def372011-12-11 20:04:56 +0100734 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200735 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100736
Victor Stinnerfe226c02011-10-03 03:52:20 +0200737 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200738 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200739 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100740 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200741 _PyUnicode_WSTR_LENGTH(unicode) = length;
742 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100743 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
744 PyObject_DEL(_PyUnicode_WSTR(unicode));
745 _PyUnicode_WSTR(unicode) = NULL;
746 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200747#ifdef Py_DEBUG
748 unicode_fill_invalid(unicode, old_length);
749#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200750 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
751 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200752 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200753 return unicode;
754}
755
Alexander Belopolsky40018472011-02-26 01:02:56 +0000756static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200757resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000758{
Victor Stinner95663112011-10-04 01:03:50 +0200759 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100760 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200761 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200762 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000763
Victor Stinnerfe226c02011-10-03 03:52:20 +0200764 if (PyUnicode_IS_READY(unicode)) {
765 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200766 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200768#ifdef Py_DEBUG
769 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
770#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200771
772 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200773 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200774 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
775 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776
777 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
778 PyErr_NoMemory();
779 return -1;
780 }
781 new_size = (length + 1) * char_size;
782
Victor Stinner7a9105a2011-12-12 00:13:42 +0100783 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
784 {
785 PyObject_DEL(_PyUnicode_UTF8(unicode));
786 _PyUnicode_UTF8(unicode) = NULL;
787 _PyUnicode_UTF8_LENGTH(unicode) = 0;
788 }
789
Victor Stinnerfe226c02011-10-03 03:52:20 +0200790 data = (PyObject *)PyObject_REALLOC(data, new_size);
791 if (data == NULL) {
792 PyErr_NoMemory();
793 return -1;
794 }
795 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200796 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200797 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200798 _PyUnicode_WSTR_LENGTH(unicode) = length;
799 }
800 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200801 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200802 _PyUnicode_UTF8_LENGTH(unicode) = length;
803 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200804 _PyUnicode_LENGTH(unicode) = length;
805 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200806#ifdef Py_DEBUG
807 unicode_fill_invalid(unicode, old_length);
808#endif
Victor Stinner95663112011-10-04 01:03:50 +0200809 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200810 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200811 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200812 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200813 }
Victor Stinner95663112011-10-04 01:03:50 +0200814 assert(_PyUnicode_WSTR(unicode) != NULL);
815
816 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700817 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +0200818 PyErr_NoMemory();
819 return -1;
820 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100821 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200822 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100823 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200824 if (!wstr) {
825 PyErr_NoMemory();
826 return -1;
827 }
828 _PyUnicode_WSTR(unicode) = wstr;
829 _PyUnicode_WSTR(unicode)[length] = 0;
830 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200831 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000832 return 0;
833}
834
Victor Stinnerfe226c02011-10-03 03:52:20 +0200835static PyObject*
836resize_copy(PyObject *unicode, Py_ssize_t length)
837{
838 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100839 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200840 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100841
Benjamin Petersonbac79492012-01-14 13:34:47 -0500842 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100843 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200844
845 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
846 if (copy == NULL)
847 return NULL;
848
849 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200850 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200851 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200852 }
853 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200854 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100855
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200856 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200857 if (w == NULL)
858 return NULL;
859 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
860 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200861 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
862 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200863 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200864 }
865}
866
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000868 Ux0000 terminated; some code (e.g. new_identifier)
869 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000870
871 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000872 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873
874*/
875
Alexander Belopolsky40018472011-02-26 01:02:56 +0000876static PyUnicodeObject *
877_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200879 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200880 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000881
Thomas Wouters477c8d52006-05-27 19:21:47 +0000882 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883 if (length == 0 && unicode_empty != NULL) {
884 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200885 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000886 }
887
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000888 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700889 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000890 return (PyUnicodeObject *)PyErr_NoMemory();
891 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200892 if (length < 0) {
893 PyErr_SetString(PyExc_SystemError,
894 "Negative size passed to _PyUnicode_New");
895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000896 }
897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200898 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
899 if (unicode == NULL)
900 return NULL;
901 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100902
903 _PyUnicode_WSTR_LENGTH(unicode) = length;
904 _PyUnicode_HASH(unicode) = -1;
905 _PyUnicode_STATE(unicode).interned = 0;
906 _PyUnicode_STATE(unicode).kind = 0;
907 _PyUnicode_STATE(unicode).compact = 0;
908 _PyUnicode_STATE(unicode).ready = 0;
909 _PyUnicode_STATE(unicode).ascii = 0;
910 _PyUnicode_DATA_ANY(unicode) = NULL;
911 _PyUnicode_LENGTH(unicode) = 0;
912 _PyUnicode_UTF8(unicode) = NULL;
913 _PyUnicode_UTF8_LENGTH(unicode) = 0;
914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200915 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
916 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100917 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000918 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100919 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000920 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200921
Jeremy Hyltond8082792003-09-16 19:41:39 +0000922 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000923 * the caller fails before initializing str -- unicode_resize()
924 * reads str[0], and the Keep-Alive optimization can keep memory
925 * allocated for str alive across a call to unicode_dealloc(unicode).
926 * We don't want unicode_resize to read uninitialized memory in
927 * that case.
928 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200929 _PyUnicode_WSTR(unicode)[0] = 0;
930 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100931
Victor Stinner7931d9a2011-11-04 00:22:48 +0100932 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000933 return unicode;
934}
935
Victor Stinnerf42dc442011-10-02 23:33:16 +0200936static const char*
937unicode_kind_name(PyObject *unicode)
938{
Victor Stinner42dfd712011-10-03 14:41:45 +0200939 /* don't check consistency: unicode_kind_name() is called from
940 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200941 if (!PyUnicode_IS_COMPACT(unicode))
942 {
943 if (!PyUnicode_IS_READY(unicode))
944 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600945 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200946 {
947 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200948 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200949 return "legacy ascii";
950 else
951 return "legacy latin1";
952 case PyUnicode_2BYTE_KIND:
953 return "legacy UCS2";
954 case PyUnicode_4BYTE_KIND:
955 return "legacy UCS4";
956 default:
957 return "<legacy invalid kind>";
958 }
959 }
960 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600961 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200962 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200963 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200964 return "ascii";
965 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200966 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200967 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200970 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200971 default:
972 return "<invalid compact kind>";
973 }
974}
975
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200976#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977/* Functions wrapping macros for use in debugger */
/* Debugger-callable wrapper around the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
981
/* Debugger-callable wrapper around the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
985void *_PyUnicode_data(void *unicode){
986 printf("obj %p\n", unicode);
987 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
988 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
989 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
990 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
991 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
992 return PyUnicode_DATA(unicode);
993}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200994
995void
996_PyUnicode_Dump(PyObject *op)
997{
998 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +0200999 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1000 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1001 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001002
Victor Stinnera849a4b2011-10-03 12:12:11 +02001003 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001004 {
1005 if (ascii->state.ascii)
1006 data = (ascii + 1);
1007 else
1008 data = (compact + 1);
1009 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001010 else
1011 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001012 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1013 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001014
Victor Stinnera849a4b2011-10-03 12:12:11 +02001015 if (ascii->wstr == data)
1016 printf("shared ");
1017 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001018
Victor Stinnera3b334d2011-10-03 13:53:37 +02001019 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001020 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001021 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1022 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001023 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1024 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001025 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001026 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001027}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001028#endif
1029
1030PyObject *
1031PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1032{
1033 PyObject *obj;
1034 PyCompactUnicodeObject *unicode;
1035 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001036 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001037 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001038 Py_ssize_t char_size;
1039 Py_ssize_t struct_size;
1040
1041 /* Optimization for empty strings */
1042 if (size == 0 && unicode_empty != NULL) {
1043 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001044 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001045 }
1046
Victor Stinner9e9d6892011-10-04 01:02:02 +02001047 is_ascii = 0;
1048 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001049 struct_size = sizeof(PyCompactUnicodeObject);
1050 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001051 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001052 char_size = 1;
1053 is_ascii = 1;
1054 struct_size = sizeof(PyASCIIObject);
1055 }
1056 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001057 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001058 char_size = 1;
1059 }
1060 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001061 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001062 char_size = 2;
1063 if (sizeof(wchar_t) == 2)
1064 is_sharing = 1;
1065 }
1066 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001067 if (maxchar > MAX_UNICODE) {
1068 PyErr_SetString(PyExc_SystemError,
1069 "invalid maximum character passed to PyUnicode_New");
1070 return NULL;
1071 }
Victor Stinner8f825062012-04-27 13:55:39 +02001072 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001073 char_size = 4;
1074 if (sizeof(wchar_t) == 4)
1075 is_sharing = 1;
1076 }
1077
1078 /* Ensure we won't overflow the size. */
1079 if (size < 0) {
1080 PyErr_SetString(PyExc_SystemError,
1081 "Negative size passed to PyUnicode_New");
1082 return NULL;
1083 }
1084 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1085 return PyErr_NoMemory();
1086
1087 /* Duplicated allocation code from _PyObject_New() instead of a call to
1088 * PyObject_New() so we are able to allocate space for the object and
1089 * it's data buffer.
1090 */
1091 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1092 if (obj == NULL)
1093 return PyErr_NoMemory();
1094 obj = PyObject_INIT(obj, &PyUnicode_Type);
1095 if (obj == NULL)
1096 return NULL;
1097
1098 unicode = (PyCompactUnicodeObject *)obj;
1099 if (is_ascii)
1100 data = ((PyASCIIObject*)obj) + 1;
1101 else
1102 data = unicode + 1;
1103 _PyUnicode_LENGTH(unicode) = size;
1104 _PyUnicode_HASH(unicode) = -1;
1105 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001106 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001107 _PyUnicode_STATE(unicode).compact = 1;
1108 _PyUnicode_STATE(unicode).ready = 1;
1109 _PyUnicode_STATE(unicode).ascii = is_ascii;
1110 if (is_ascii) {
1111 ((char*)data)[size] = 0;
1112 _PyUnicode_WSTR(unicode) = NULL;
1113 }
Victor Stinner8f825062012-04-27 13:55:39 +02001114 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001115 ((char*)data)[size] = 0;
1116 _PyUnicode_WSTR(unicode) = NULL;
1117 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001118 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001119 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001121 else {
1122 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001123 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001124 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001125 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001126 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001127 ((Py_UCS4*)data)[size] = 0;
1128 if (is_sharing) {
1129 _PyUnicode_WSTR_LENGTH(unicode) = size;
1130 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1131 }
1132 else {
1133 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1134 _PyUnicode_WSTR(unicode) = NULL;
1135 }
1136 }
Victor Stinner8f825062012-04-27 13:55:39 +02001137#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001138 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001139#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001140 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001141 return obj;
1142}
1143
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, storing the result
   in the 4-byte data of `unicode`.  A high surrogate immediately followed
   by a low surrogate is joined into one code point; every other unit,
   including a lone surrogate, is copied as is.  The other conversions are
   implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *in;
    Py_UCS4 *out;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    out = PyUnicode_4BYTE_DATA(unicode);

    in = begin;
    while (in < end) {
        assert(out < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(in[0])
            && (in+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(in[1]))
        {
            /* A well-formed pair consumes two input units. */
            *out++ = Py_UNICODE_JOIN_SURROGATES(in[0], in[1]);
            in += 2;
            continue;
        }
        *out++ = *in;
        in++;
    }

    /* The destination must have been filled exactly. */
    assert(out == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1183
Victor Stinnercd9950f2011-10-02 00:34:53 +02001184static int
Victor Stinner488fa492011-12-12 00:01:39 +01001185unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001186{
Victor Stinner488fa492011-12-12 00:01:39 +01001187 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001188 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001189 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001190 return -1;
1191 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001192 return 0;
1193}
1194
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001195static int
1196_copy_characters(PyObject *to, Py_ssize_t to_start,
1197 PyObject *from, Py_ssize_t from_start,
1198 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001199{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001200 unsigned int from_kind, to_kind;
1201 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001202
Victor Stinneree4544c2012-05-09 22:24:08 +02001203 assert(0 <= how_many);
1204 assert(0 <= from_start);
1205 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001206 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001207 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001208 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001209
Victor Stinnerd3f08822012-05-29 12:57:52 +02001210 assert(PyUnicode_Check(to));
1211 assert(PyUnicode_IS_READY(to));
1212 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1213
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001214 if (how_many == 0)
1215 return 0;
1216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001217 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001218 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001220 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001221
Victor Stinnerf1852262012-06-16 16:38:26 +02001222#ifdef Py_DEBUG
1223 if (!check_maxchar
1224 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1225 {
1226 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1227 Py_UCS4 ch;
1228 Py_ssize_t i;
1229 for (i=0; i < how_many; i++) {
1230 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1231 assert(ch <= to_maxchar);
1232 }
1233 }
1234#endif
1235
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001236 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001237 if (check_maxchar
1238 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1239 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001240 /* Writing Latin-1 characters into an ASCII string requires to
1241 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001242 Py_UCS4 max_char;
1243 max_char = ucs1lib_find_max_char(from_data,
1244 (Py_UCS1*)from_data + how_many);
1245 if (max_char >= 128)
1246 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001247 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001248 Py_MEMCPY((char*)to_data + to_kind * to_start,
1249 (char*)from_data + from_kind * from_start,
1250 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001252 else if (from_kind == PyUnicode_1BYTE_KIND
1253 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001254 {
1255 _PyUnicode_CONVERT_BYTES(
1256 Py_UCS1, Py_UCS2,
1257 PyUnicode_1BYTE_DATA(from) + from_start,
1258 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1259 PyUnicode_2BYTE_DATA(to) + to_start
1260 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001261 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001262 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001263 && to_kind == PyUnicode_4BYTE_KIND)
1264 {
1265 _PyUnicode_CONVERT_BYTES(
1266 Py_UCS1, Py_UCS4,
1267 PyUnicode_1BYTE_DATA(from) + from_start,
1268 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1269 PyUnicode_4BYTE_DATA(to) + to_start
1270 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001271 }
1272 else if (from_kind == PyUnicode_2BYTE_KIND
1273 && to_kind == PyUnicode_4BYTE_KIND)
1274 {
1275 _PyUnicode_CONVERT_BYTES(
1276 Py_UCS2, Py_UCS4,
1277 PyUnicode_2BYTE_DATA(from) + from_start,
1278 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1279 PyUnicode_4BYTE_DATA(to) + to_start
1280 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001281 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001282 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001283 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1284
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001285 if (!check_maxchar) {
1286 if (from_kind == PyUnicode_2BYTE_KIND
1287 && to_kind == PyUnicode_1BYTE_KIND)
1288 {
1289 _PyUnicode_CONVERT_BYTES(
1290 Py_UCS2, Py_UCS1,
1291 PyUnicode_2BYTE_DATA(from) + from_start,
1292 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1293 PyUnicode_1BYTE_DATA(to) + to_start
1294 );
1295 }
1296 else if (from_kind == PyUnicode_4BYTE_KIND
1297 && to_kind == PyUnicode_1BYTE_KIND)
1298 {
1299 _PyUnicode_CONVERT_BYTES(
1300 Py_UCS4, Py_UCS1,
1301 PyUnicode_4BYTE_DATA(from) + from_start,
1302 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1303 PyUnicode_1BYTE_DATA(to) + to_start
1304 );
1305 }
1306 else if (from_kind == PyUnicode_4BYTE_KIND
1307 && to_kind == PyUnicode_2BYTE_KIND)
1308 {
1309 _PyUnicode_CONVERT_BYTES(
1310 Py_UCS4, Py_UCS2,
1311 PyUnicode_4BYTE_DATA(from) + from_start,
1312 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1313 PyUnicode_2BYTE_DATA(to) + to_start
1314 );
1315 }
1316 else {
1317 assert(0);
1318 return -1;
1319 }
1320 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001321 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001322 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001323 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001324 Py_ssize_t i;
1325
Victor Stinnera0702ab2011-09-29 14:14:38 +02001326 for (i=0; i < how_many; i++) {
1327 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001328 if (ch > to_maxchar)
1329 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001330 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1331 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001332 }
1333 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001334 return 0;
1335}
1336
Victor Stinnerd3f08822012-05-29 12:57:52 +02001337void
1338_PyUnicode_FastCopyCharacters(
1339 PyObject *to, Py_ssize_t to_start,
1340 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001341{
1342 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1343}
1344
1345Py_ssize_t
1346PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1347 PyObject *from, Py_ssize_t from_start,
1348 Py_ssize_t how_many)
1349{
1350 int err;
1351
1352 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1353 PyErr_BadInternalCall();
1354 return -1;
1355 }
1356
Benjamin Petersonbac79492012-01-14 13:34:47 -05001357 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001358 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001359 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001360 return -1;
1361
Victor Stinnerd3f08822012-05-29 12:57:52 +02001362 if (from_start < 0) {
1363 PyErr_SetString(PyExc_IndexError, "string index out of range");
1364 return -1;
1365 }
1366 if (to_start < 0) {
1367 PyErr_SetString(PyExc_IndexError, "string index out of range");
1368 return -1;
1369 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001370 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1371 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1372 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001373 "Cannot write %zi characters at %zi "
1374 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001375 how_many, to_start, PyUnicode_GET_LENGTH(to));
1376 return -1;
1377 }
1378
1379 if (how_many == 0)
1380 return 0;
1381
Victor Stinner488fa492011-12-12 00:01:39 +01001382 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001383 return -1;
1384
1385 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1386 if (err) {
1387 PyErr_Format(PyExc_SystemError,
1388 "Cannot copy %s characters "
1389 "into a string of %s characters",
1390 unicode_kind_name(from),
1391 unicode_kind_name(to));
1392 return -1;
1393 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001394 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395}
1396
Victor Stinner17222162011-09-28 22:15:37 +02001397/* Find the maximum code point and count the number of surrogate pairs so a
1398 correct string length can be computed before converting a string to UCS4.
1399 This function counts single surrogates as a character and not as a pair.
1400
1401 Return 0 on success, or -1 on error. */
1402static int
1403find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1404 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405{
1406 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001407 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408
Victor Stinnerc53be962011-10-02 21:33:54 +02001409 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 *num_surrogates = 0;
1411 *maxchar = 0;
1412
1413 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001415 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1416 && (iter+1) < end
1417 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1418 {
1419 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1420 ++(*num_surrogates);
1421 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422 }
1423 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001425 {
1426 ch = *iter;
1427 iter++;
1428 }
1429 if (ch > *maxchar) {
1430 *maxchar = ch;
1431 if (*maxchar > MAX_UNICODE) {
1432 PyErr_Format(PyExc_ValueError,
1433 "character U+%x is not in range [U+0000; U+10ffff]",
1434 ch);
1435 return -1;
1436 }
1437 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 }
1439 return 0;
1440}
1441
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001442int
1443_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444{
1445 wchar_t *end;
1446 Py_UCS4 maxchar = 0;
1447 Py_ssize_t num_surrogates;
1448#if SIZEOF_WCHAR_T == 2
1449 Py_ssize_t length_wo_surrogates;
1450#endif
1451
Georg Brandl7597add2011-10-05 16:36:47 +02001452 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001453 strings were created using _PyObject_New() and where no canonical
1454 representation (the str field) has been set yet aka strings
1455 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001456 assert(_PyUnicode_CHECK(unicode));
1457 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001459 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001460 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001461 /* Actually, it should neither be interned nor be anything else: */
1462 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001465 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001466 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468
1469 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001470 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1471 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 PyErr_NoMemory();
1473 return -1;
1474 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001475 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 _PyUnicode_WSTR(unicode), end,
1477 PyUnicode_1BYTE_DATA(unicode));
1478 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1479 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1480 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1481 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001482 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001483 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001484 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 }
1486 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001487 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001488 _PyUnicode_UTF8(unicode) = NULL;
1489 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 }
1491 PyObject_FREE(_PyUnicode_WSTR(unicode));
1492 _PyUnicode_WSTR(unicode) = NULL;
1493 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1494 }
1495 /* In this case we might have to convert down from 4-byte native
1496 wchar_t to 2-byte unicode. */
1497 else if (maxchar < 65536) {
1498 assert(num_surrogates == 0 &&
1499 "FindMaxCharAndNumSurrogatePairs() messed up");
1500
Victor Stinner506f5922011-09-28 22:34:18 +02001501#if SIZEOF_WCHAR_T == 2
1502 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001503 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001504 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1505 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1506 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001507 _PyUnicode_UTF8(unicode) = NULL;
1508 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001509#else
1510 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001511 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001512 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001513 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001514 PyErr_NoMemory();
1515 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001516 }
Victor Stinner506f5922011-09-28 22:34:18 +02001517 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1518 _PyUnicode_WSTR(unicode), end,
1519 PyUnicode_2BYTE_DATA(unicode));
1520 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1521 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1522 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001523 _PyUnicode_UTF8(unicode) = NULL;
1524 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001525 PyObject_FREE(_PyUnicode_WSTR(unicode));
1526 _PyUnicode_WSTR(unicode) = NULL;
1527 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1528#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529 }
1530 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1531 else {
1532#if SIZEOF_WCHAR_T == 2
1533 /* in case the native representation is 2-bytes, we need to allocate a
1534 new normalized 4-byte version. */
1535 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001536 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1537 PyErr_NoMemory();
1538 return -1;
1539 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001540 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1541 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542 PyErr_NoMemory();
1543 return -1;
1544 }
1545 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1546 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001547 _PyUnicode_UTF8(unicode) = NULL;
1548 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001549 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1550 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001551 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001552 PyObject_FREE(_PyUnicode_WSTR(unicode));
1553 _PyUnicode_WSTR(unicode) = NULL;
1554 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1555#else
1556 assert(num_surrogates == 0);
1557
Victor Stinnerc3c74152011-10-02 20:39:55 +02001558 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001559 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001560 _PyUnicode_UTF8(unicode) = NULL;
1561 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001562 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1563#endif
1564 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1565 }
1566 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001567 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001568 return 0;
1569}
1570
Alexander Belopolsky40018472011-02-26 01:02:56 +00001571static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001572unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573{
Walter Dörwald16807132007-05-25 13:52:07 +00001574 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001575 case SSTATE_NOT_INTERNED:
1576 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001577
Benjamin Peterson29060642009-01-31 22:14:21 +00001578 case SSTATE_INTERNED_MORTAL:
1579 /* revive dead object temporarily for DelItem */
1580 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001581 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001582 Py_FatalError(
1583 "deletion of interned string failed");
1584 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001585
Benjamin Peterson29060642009-01-31 22:14:21 +00001586 case SSTATE_INTERNED_IMMORTAL:
1587 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001588
Benjamin Peterson29060642009-01-31 22:14:21 +00001589 default:
1590 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001591 }
1592
Victor Stinner03490912011-10-03 23:45:12 +02001593 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001594 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001595 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001596 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001597 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1598 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001600 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601}
1602
#ifdef Py_DEBUG
/* Debug-build helper: return 1 if `unicode` is the shared empty string or
   the object stored in the unicode_latin1[] table for its single
   character, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    ascii = (PyASCIIObject *)unicode;
    /* Only ready, one-character strings can live in the Latin-1 table. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch >= 256)
        return 0;
    return unicode_latin1[ch] == unicode ? 1 : 0;
}
#endif
1619
Alexander Belopolsky40018472011-02-26 01:02:56 +00001620static int
Victor Stinner488fa492011-12-12 00:01:39 +01001621unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001622{
Victor Stinner488fa492011-12-12 00:01:39 +01001623 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001624 if (Py_REFCNT(unicode) != 1)
1625 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001626 if (_PyUnicode_HASH(unicode) != -1)
1627 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001628 if (PyUnicode_CHECK_INTERNED(unicode))
1629 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001630 if (!PyUnicode_CheckExact(unicode))
1631 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001632#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001633 /* singleton refcount is greater than 1 */
1634 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001635#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001636 return 1;
1637}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001638
Victor Stinnerfe226c02011-10-03 03:52:20 +02001639static int
1640unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1641{
1642 PyObject *unicode;
1643 Py_ssize_t old_length;
1644
1645 assert(p_unicode != NULL);
1646 unicode = *p_unicode;
1647
1648 assert(unicode != NULL);
1649 assert(PyUnicode_Check(unicode));
1650 assert(0 <= length);
1651
Victor Stinner910337b2011-10-03 03:20:16 +02001652 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001653 old_length = PyUnicode_WSTR_LENGTH(unicode);
1654 else
1655 old_length = PyUnicode_GET_LENGTH(unicode);
1656 if (old_length == length)
1657 return 0;
1658
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001659 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001660 _Py_INCREF_UNICODE_EMPTY();
1661 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001662 return -1;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001663 Py_DECREF(*p_unicode);
1664 *p_unicode = unicode_empty;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001665 return 0;
1666 }
1667
Victor Stinner488fa492011-12-12 00:01:39 +01001668 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001669 PyObject *copy = resize_copy(unicode, length);
1670 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001671 return -1;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001672 Py_DECREF(*p_unicode);
1673 *p_unicode = copy;
Benjamin Peterson29060642009-01-31 22:14:21 +00001674 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001675 }
1676
Victor Stinnerfe226c02011-10-03 03:52:20 +02001677 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001678 PyObject *new_unicode = resize_compact(unicode, length);
1679 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001680 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001681 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001682 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001683 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001684 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001685}
1686
Alexander Belopolsky40018472011-02-26 01:02:56 +00001687int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001688PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001689{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001690 PyObject *unicode;
1691 if (p_unicode == NULL) {
1692 PyErr_BadInternalCall();
1693 return -1;
1694 }
1695 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001696 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001697 {
1698 PyErr_BadInternalCall();
1699 return -1;
1700 }
1701 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001702}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001703
Victor Stinnerc5166102012-02-22 13:55:02 +01001704/* Copy a ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001705
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001706 WARNING: The function doesn't copy the terminating null character and
1707 doesn't check the maximum character (may write a latin1 character in an
1708 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001709static void
1710unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1711 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001712{
1713 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1714 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001715 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001716
1717 switch (kind) {
1718 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001719 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001720#ifdef Py_DEBUG
1721 if (PyUnicode_IS_ASCII(unicode)) {
1722 Py_UCS4 maxchar = ucs1lib_find_max_char(
1723 (const Py_UCS1*)str,
1724 (const Py_UCS1*)str + len);
1725 assert(maxchar < 128);
1726 }
1727#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001728 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001729 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001730 }
1731 case PyUnicode_2BYTE_KIND: {
1732 Py_UCS2 *start = (Py_UCS2 *)data + index;
1733 Py_UCS2 *ucs2 = start;
1734 assert(index <= PyUnicode_GET_LENGTH(unicode));
1735
Victor Stinner184252a2012-06-16 02:57:41 +02001736 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001737 *ucs2 = (Py_UCS2)*str;
1738
1739 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001740 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001741 }
1742 default: {
1743 Py_UCS4 *start = (Py_UCS4 *)data + index;
1744 Py_UCS4 *ucs4 = start;
1745 assert(kind == PyUnicode_4BYTE_KIND);
1746 assert(index <= PyUnicode_GET_LENGTH(unicode));
1747
Victor Stinner184252a2012-06-16 02:57:41 +02001748 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001749 *ucs4 = (Py_UCS4)*str;
1750
1751 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001752 }
1753 }
1754}
1755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756static PyObject*
1757get_latin1_char(unsigned char ch)
1758{
Victor Stinnera464fc12011-10-02 20:39:30 +02001759 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001761 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 if (!unicode)
1763 return NULL;
1764 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001765 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766 unicode_latin1[ch] = unicode;
1767 }
1768 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001769 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770}
1771
Victor Stinner985a82a2014-01-03 12:53:47 +01001772static PyObject*
1773unicode_char(Py_UCS4 ch)
1774{
1775 PyObject *unicode;
1776
1777 assert(ch <= MAX_UNICODE);
1778
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001779 if (ch < 256)
1780 return get_latin1_char(ch);
1781
Victor Stinner985a82a2014-01-03 12:53:47 +01001782 unicode = PyUnicode_New(1, ch);
1783 if (unicode == NULL)
1784 return NULL;
1785 switch (PyUnicode_KIND(unicode)) {
1786 case PyUnicode_1BYTE_KIND:
1787 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1788 break;
1789 case PyUnicode_2BYTE_KIND:
1790 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1791 break;
1792 default:
1793 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1794 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1795 }
1796 assert(_PyUnicode_CheckConsistency(unicode, 1));
1797 return unicode;
1798}
1799
Alexander Belopolsky40018472011-02-26 01:02:56 +00001800PyObject *
1801PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001803 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001804 Py_UCS4 maxchar = 0;
1805 Py_ssize_t num_surrogates;
1806
1807 if (u == NULL)
1808 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001810 /* If the Unicode data is known at construction time, we can apply
1811 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001812
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001814 if (size == 0)
1815 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 /* Single character Unicode objects in the Latin-1 range are
1818 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001819 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 return get_latin1_char((unsigned char)*u);
1821
1822 /* If not empty and not single character, copy the Unicode data
1823 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001824 if (find_maxchar_surrogates(u, u + size,
1825 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 return NULL;
1827
Victor Stinner8faf8212011-12-08 22:14:11 +01001828 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 if (!unicode)
1830 return NULL;
1831
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 switch (PyUnicode_KIND(unicode)) {
1833 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001834 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1836 break;
1837 case PyUnicode_2BYTE_KIND:
1838#if Py_UNICODE_SIZE == 2
1839 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1840#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001841 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1843#endif
1844 break;
1845 case PyUnicode_4BYTE_KIND:
1846#if SIZEOF_WCHAR_T == 2
1847 /* This is the only case which has to process surrogates, thus
1848 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001849 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001850#else
1851 assert(num_surrogates == 0);
1852 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1853#endif
1854 break;
1855 default:
1856 assert(0 && "Impossible state");
1857 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001859 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860}
1861
Alexander Belopolsky40018472011-02-26 01:02:56 +00001862PyObject *
1863PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001864{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001865 if (size < 0) {
1866 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001867 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001868 return NULL;
1869 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001870 if (u != NULL)
1871 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1872 else
1873 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001874}
1875
Alexander Belopolsky40018472011-02-26 01:02:56 +00001876PyObject *
1877PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001878{
1879 size_t size = strlen(u);
1880 if (size > PY_SSIZE_T_MAX) {
1881 PyErr_SetString(PyExc_OverflowError, "input too long");
1882 return NULL;
1883 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001884 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001885}
1886
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001887PyObject *
1888_PyUnicode_FromId(_Py_Identifier *id)
1889{
1890 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001891 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1892 strlen(id->string),
1893 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001894 if (!id->object)
1895 return NULL;
1896 PyUnicode_InternInPlace(&id->object);
1897 assert(!id->next);
1898 id->next = static_strings;
1899 static_strings = id;
1900 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001901 return id->object;
1902}
1903
1904void
1905_PyUnicode_ClearStaticStrings()
1906{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001907 _Py_Identifier *tmp, *s = static_strings;
1908 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001909 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001910 tmp = s->next;
1911 s->next = NULL;
1912 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001913 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001914 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001915}
1916
Benjamin Peterson0df54292012-03-26 14:50:32 -04001917/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001918
Victor Stinnerd3f08822012-05-29 12:57:52 +02001919PyObject*
1920_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001921{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001922 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001923 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001924 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001925#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001926 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001927#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001928 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001929 }
Victor Stinner785938e2011-12-11 20:09:03 +01001930 unicode = PyUnicode_New(size, 127);
1931 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001932 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001933 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1934 assert(_PyUnicode_CheckConsistency(unicode, 1));
1935 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001936}
1937
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001938static Py_UCS4
1939kind_maxchar_limit(unsigned int kind)
1940{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001941 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001942 case PyUnicode_1BYTE_KIND:
1943 return 0x80;
1944 case PyUnicode_2BYTE_KIND:
1945 return 0x100;
1946 case PyUnicode_4BYTE_KIND:
1947 return 0x10000;
1948 default:
1949 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001950 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001951 }
1952}
1953
Victor Stinnere6abb482012-05-02 01:15:40 +02001954Py_LOCAL_INLINE(Py_UCS4)
1955align_maxchar(Py_UCS4 maxchar)
1956{
1957 if (maxchar <= 127)
1958 return 127;
1959 else if (maxchar <= 255)
1960 return 255;
1961 else if (maxchar <= 65535)
1962 return 65535;
1963 else
1964 return MAX_UNICODE;
1965}
1966
Victor Stinner702c7342011-10-05 13:50:52 +02001967static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001968_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001969{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001970 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001971 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001972
Serhiy Storchaka678db842013-01-26 12:16:36 +02001973 if (size == 0)
1974 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001975 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001976 if (size == 1)
1977 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001978
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001979 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001980 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001981 if (!res)
1982 return NULL;
1983 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001984 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001985 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001986}
1987
Victor Stinnere57b1c02011-09-28 22:20:48 +02001988static PyObject*
1989_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001990{
1991 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001992 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001993
Serhiy Storchaka678db842013-01-26 12:16:36 +02001994 if (size == 0)
1995 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001996 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01001997 if (size == 1)
1998 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001999
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002000 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002001 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002002 if (!res)
2003 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002004 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002006 else {
2007 _PyUnicode_CONVERT_BYTES(
2008 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2009 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002010 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002011 return res;
2012}
2013
Victor Stinnere57b1c02011-09-28 22:20:48 +02002014static PyObject*
2015_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002016{
2017 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002018 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002019
Serhiy Storchaka678db842013-01-26 12:16:36 +02002020 if (size == 0)
2021 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002022 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002023 if (size == 1)
2024 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002025
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002026 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002027 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028 if (!res)
2029 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002030 if (max_char < 256)
2031 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2032 PyUnicode_1BYTE_DATA(res));
2033 else if (max_char < 0x10000)
2034 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2035 PyUnicode_2BYTE_DATA(res));
2036 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002037 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002038 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002039 return res;
2040}
2041
2042PyObject*
2043PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2044{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002045 if (size < 0) {
2046 PyErr_SetString(PyExc_ValueError, "size must be positive");
2047 return NULL;
2048 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002049 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002050 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002051 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002052 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002053 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002054 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002055 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002056 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002057 PyErr_SetString(PyExc_SystemError, "invalid kind");
2058 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002059 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002060}
2061
Victor Stinnerece58de2012-04-23 23:36:38 +02002062Py_UCS4
2063_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2064{
2065 enum PyUnicode_Kind kind;
2066 void *startptr, *endptr;
2067
2068 assert(PyUnicode_IS_READY(unicode));
2069 assert(0 <= start);
2070 assert(end <= PyUnicode_GET_LENGTH(unicode));
2071 assert(start <= end);
2072
2073 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2074 return PyUnicode_MAX_CHAR_VALUE(unicode);
2075
2076 if (start == end)
2077 return 127;
2078
Victor Stinner94d558b2012-04-27 22:26:58 +02002079 if (PyUnicode_IS_ASCII(unicode))
2080 return 127;
2081
Victor Stinnerece58de2012-04-23 23:36:38 +02002082 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002083 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002084 endptr = (char *)startptr + end * kind;
2085 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002086 switch(kind) {
2087 case PyUnicode_1BYTE_KIND:
2088 return ucs1lib_find_max_char(startptr, endptr);
2089 case PyUnicode_2BYTE_KIND:
2090 return ucs2lib_find_max_char(startptr, endptr);
2091 case PyUnicode_4BYTE_KIND:
2092 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002093 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002094 assert(0);
2095 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002096 }
2097}
2098
Victor Stinner25a4b292011-10-06 12:31:55 +02002099/* Ensure that a string uses the most efficient storage, if it is not the
2100 case: create a new string with of the right kind. Write NULL into *p_unicode
2101 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002102static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002103unicode_adjust_maxchar(PyObject **p_unicode)
2104{
2105 PyObject *unicode, *copy;
2106 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002107 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002108 unsigned int kind;
2109
2110 assert(p_unicode != NULL);
2111 unicode = *p_unicode;
2112 assert(PyUnicode_IS_READY(unicode));
2113 if (PyUnicode_IS_ASCII(unicode))
2114 return;
2115
2116 len = PyUnicode_GET_LENGTH(unicode);
2117 kind = PyUnicode_KIND(unicode);
2118 if (kind == PyUnicode_1BYTE_KIND) {
2119 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002120 max_char = ucs1lib_find_max_char(u, u + len);
2121 if (max_char >= 128)
2122 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002123 }
2124 else if (kind == PyUnicode_2BYTE_KIND) {
2125 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002126 max_char = ucs2lib_find_max_char(u, u + len);
2127 if (max_char >= 256)
2128 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002129 }
2130 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002131 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002132 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002133 max_char = ucs4lib_find_max_char(u, u + len);
2134 if (max_char >= 0x10000)
2135 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002136 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002137 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002138 if (copy != NULL)
2139 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002140 Py_DECREF(unicode);
2141 *p_unicode = copy;
2142}
2143
Victor Stinner034f6cf2011-09-30 02:26:44 +02002144PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002145_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002146{
Victor Stinner87af4f22011-11-21 23:03:47 +01002147 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002148 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002149
Victor Stinner034f6cf2011-09-30 02:26:44 +02002150 if (!PyUnicode_Check(unicode)) {
2151 PyErr_BadInternalCall();
2152 return NULL;
2153 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002154 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002155 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002156
Victor Stinner87af4f22011-11-21 23:03:47 +01002157 length = PyUnicode_GET_LENGTH(unicode);
2158 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002159 if (!copy)
2160 return NULL;
2161 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2162
Victor Stinner87af4f22011-11-21 23:03:47 +01002163 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2164 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002165 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002166 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002167}
2168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002169
Victor Stinnerbc603d12011-10-02 01:00:40 +02002170/* Widen Unicode objects to larger buffers. Don't write terminating null
2171 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172
2173void*
2174_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2175{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002176 Py_ssize_t len;
2177 void *result;
2178 unsigned int skind;
2179
Benjamin Petersonbac79492012-01-14 13:34:47 -05002180 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002181 return NULL;
2182
2183 len = PyUnicode_GET_LENGTH(s);
2184 skind = PyUnicode_KIND(s);
2185 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002186 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002187 return NULL;
2188 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002189 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002190 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002191 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002192 if (!result)
2193 return PyErr_NoMemory();
2194 assert(skind == PyUnicode_1BYTE_KIND);
2195 _PyUnicode_CONVERT_BYTES(
2196 Py_UCS1, Py_UCS2,
2197 PyUnicode_1BYTE_DATA(s),
2198 PyUnicode_1BYTE_DATA(s) + len,
2199 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002201 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002202 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002203 if (!result)
2204 return PyErr_NoMemory();
2205 if (skind == PyUnicode_2BYTE_KIND) {
2206 _PyUnicode_CONVERT_BYTES(
2207 Py_UCS2, Py_UCS4,
2208 PyUnicode_2BYTE_DATA(s),
2209 PyUnicode_2BYTE_DATA(s) + len,
2210 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002211 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002212 else {
2213 assert(skind == PyUnicode_1BYTE_KIND);
2214 _PyUnicode_CONVERT_BYTES(
2215 Py_UCS1, Py_UCS4,
2216 PyUnicode_1BYTE_DATA(s),
2217 PyUnicode_1BYTE_DATA(s) + len,
2218 result);
2219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002221 default:
2222 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 }
Victor Stinner01698042011-10-04 00:04:26 +02002224 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002225 return NULL;
2226}
2227
2228static Py_UCS4*
2229as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2230 int copy_null)
2231{
2232 int kind;
2233 void *data;
2234 Py_ssize_t len, targetlen;
2235 if (PyUnicode_READY(string) == -1)
2236 return NULL;
2237 kind = PyUnicode_KIND(string);
2238 data = PyUnicode_DATA(string);
2239 len = PyUnicode_GET_LENGTH(string);
2240 targetlen = len;
2241 if (copy_null)
2242 targetlen++;
2243 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002244 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 if (!target) {
2246 PyErr_NoMemory();
2247 return NULL;
2248 }
2249 }
2250 else {
2251 if (targetsize < targetlen) {
2252 PyErr_Format(PyExc_SystemError,
2253 "string is longer than the buffer");
2254 if (copy_null && 0 < targetsize)
2255 target[0] = 0;
2256 return NULL;
2257 }
2258 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002259 if (kind == PyUnicode_1BYTE_KIND) {
2260 Py_UCS1 *start = (Py_UCS1 *) data;
2261 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002262 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002263 else if (kind == PyUnicode_2BYTE_KIND) {
2264 Py_UCS2 *start = (Py_UCS2 *) data;
2265 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2266 }
2267 else {
2268 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271 if (copy_null)
2272 target[len] = 0;
2273 return target;
2274}
2275
2276Py_UCS4*
2277PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2278 int copy_null)
2279{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002280 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 PyErr_BadInternalCall();
2282 return NULL;
2283 }
2284 return as_ucs4(string, target, targetsize, copy_null);
2285}
2286
/* Return a newly allocated UCS4 copy of `string`, including a trailing null
   character.  The buffer is obtained by as_ucs4() through PyMem_New (a NULL
   target requests allocation; copy_null is 1).  Return NULL on error. */
Py_UCS4*
PyUnicode_AsUCS4Copy(PyObject *string)
{
    return as_ucs4(string, NULL, 0, 1);
}
2292
2293#ifdef HAVE_WCHAR_H
Mark Dickinson081dfee2009-03-18 14:47:41 +00002294
Alexander Belopolsky40018472011-02-26 01:02:56 +00002295PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002296PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297{
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298 if (w == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002299 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02002300 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson29060642009-01-31 22:14:21 +00002301 PyErr_BadInternalCall();
2302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002303 }
2304
Martin v. Löwis790465f2008-04-05 20:41:37 +00002305 if (size == -1) {
2306 size = wcslen(w);
2307 }
2308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 return PyUnicode_FromUnicode(w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310}
2311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002313
/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign. 53/22 is an upper bound for log10(256).

   In the expression below, (SIZEOF_LONG_LONG*53-1) / 22 + 1 is the integer
   form of ceil(SIZEOF_LONG_LONG*53/22); the remaining +1 is the slot for
   the sign.  The value is used to size the sprintf() buffers for the
   integer and %p conversions in unicode_fromformat_arg(). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002318
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002319static int
2320unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2321 Py_ssize_t width, Py_ssize_t precision)
2322{
2323 Py_ssize_t length, fill, arglen;
2324 Py_UCS4 maxchar;
2325
2326 if (PyUnicode_READY(str) == -1)
2327 return -1;
2328
2329 length = PyUnicode_GET_LENGTH(str);
2330 if ((precision == -1 || precision >= length)
2331 && width <= length)
2332 return _PyUnicodeWriter_WriteStr(writer, str);
2333
2334 if (precision != -1)
2335 length = Py_MIN(precision, length);
2336
2337 arglen = Py_MAX(length, width);
2338 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2339 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2340 else
2341 maxchar = writer->maxchar;
2342
2343 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2344 return -1;
2345
2346 if (width > length) {
2347 fill = width - length;
2348 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2349 return -1;
2350 writer->pos += fill;
2351 }
2352
2353 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2354 str, 0, length);
2355 writer->pos += length;
2356 return 0;
2357}
2358
2359static int
2360unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2361 Py_ssize_t width, Py_ssize_t precision)
2362{
2363 /* UTF-8 */
2364 Py_ssize_t length;
2365 PyObject *unicode;
2366 int res;
2367
2368 length = strlen(str);
2369 if (precision != -1)
2370 length = Py_MIN(length, precision);
2371 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2372 if (unicode == NULL)
2373 return -1;
2374
2375 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2376 Py_DECREF(unicode);
2377 return res;
2378}
2379
/* Process one '%' directive of a PyUnicode_FromFormatV() format string.

   f points at the '%'.  The optional '0' flag, width, ".precision" and the
   l / ll / z size flags are parsed, the matching argument(s) are fetched
   from *vargs, and the formatted text is appended to writer.

   Return a pointer to the first format character after the directive, or
   NULL on error.  For an unknown conversion the rest of the format string,
   starting at the '%', is copied to the output verbatim and a pointer to
   the terminating null character is returned. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* remember where the directive starts: needed by the default case */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* reject the digit before width*10 + digit can overflow */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* same overflow guard as for the width */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    /* A size flag is only consumed when it is directly followed by d, u or
       i; otherwise f stays on the flag character and the switch below
       treats it as the conversion character. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* the conversion character is the last character of the format: turn
       off the writer's overallocation for this final write */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* a single character given as its code point */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* Format the bare number with sprintf(); width and precision are
           applied afterwards by padding in the writer. */
        if (*f == 'u') {
            if (longflag)
                len = sprintf(buffer, "%lu",
                        va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
                        va_arg(*vargs, unsigned PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, "%u",
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* the size flags are never set for 'x': always reads an int */
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag)
                len = sprintf(buffer, "%li",
                        va_arg(*vargs, long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
                        va_arg(*vargs, PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, "%i",
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* from here on precision is the number of characters written for
           the number itself, including any leading zeros */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        if (width > precision) {
            /* pad up to the field width, with '0' if the 0 flag was given */
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        if (precision > len) {
            /* zero-fill up to the requested precision */
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined: ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* shift the text (with its null byte) right to insert "0x" */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* a string object, written as is */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* consumes two arguments: a string object, and a C string that is
           used instead when the object is NULL */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* the result of PyObject_Str(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* the result of PyObject_Repr(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* the result of PyObject_ASCII(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* literal percent sign */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* step over the conversion character */
    f++;
    return f;
}
2683
Walter Dörwaldd2034312007-05-18 16:29:38 +00002684PyObject *
2685PyUnicode_FromFormatV(const char *format, va_list vargs)
2686{
Victor Stinnere215d962012-10-06 23:03:36 +02002687 va_list vargs2;
2688 const char *f;
2689 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002690
Victor Stinner8f674cc2013-04-17 23:02:17 +02002691 _PyUnicodeWriter_Init(&writer);
2692 writer.min_length = strlen(format) + 100;
2693 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002694
2695 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2696 Copy it to be able to pass a reference to a subfunction. */
2697 Py_VA_COPY(vargs2, vargs);
2698
2699 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002700 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002701 f = unicode_fromformat_arg(&writer, f, &vargs2);
2702 if (f == NULL)
2703 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002704 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002705 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002706 const char *p;
2707 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002708
Victor Stinnere215d962012-10-06 23:03:36 +02002709 p = f;
2710 do
2711 {
2712 if ((unsigned char)*p > 127) {
2713 PyErr_Format(PyExc_ValueError,
2714 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2715 "string, got a non-ASCII byte: 0x%02x",
2716 (unsigned char)*p);
2717 return NULL;
2718 }
2719 p++;
2720 }
2721 while (*p != '\0' && *p != '%');
2722 len = p - f;
2723
2724 if (*p == '\0')
2725 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002726
2727 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002728 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002729
2730 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002731 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002732 }
Victor Stinnere215d962012-10-06 23:03:36 +02002733 return _PyUnicodeWriter_Finish(&writer);
2734
2735 fail:
2736 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002737 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002738}
2739
/* Variadic front end of PyUnicode_FromFormatV(): collect the arguments that
   follow `format` into a va_list, forward them, and return whatever
   PyUnicode_FromFormatV() returns. */
PyObject *
PyUnicode_FromFormat(const char *format, ...)
{
    PyObject* ret;
    va_list vargs;

    /* the one-argument va_start() form is kept for builds without
       HAVE_STDARG_PROTOTYPES */
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    ret = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);
    return ret;
}
2755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002756#ifdef HAVE_WCHAR_H
2757
Victor Stinner5593d8a2010-10-02 11:11:27 +00002758/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2759 convert a Unicode object to a wide character string.
2760
Victor Stinnerd88d9832011-09-06 02:00:05 +02002761 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002762 character) required to convert the unicode object. Ignore size argument.
2763
Victor Stinnerd88d9832011-09-06 02:00:05 +02002764 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002765 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002766 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002767static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002768unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002769 wchar_t *w,
2770 Py_ssize_t size)
2771{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002772 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002773 const wchar_t *wstr;
2774
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002775 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002776 if (wstr == NULL)
2777 return -1;
2778
Victor Stinner5593d8a2010-10-02 11:11:27 +00002779 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002780 if (size > res)
2781 size = res + 1;
2782 else
2783 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002784 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002785 return res;
2786 }
2787 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002788 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002789}
2790
2791Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002792PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002793 wchar_t *w,
2794 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795{
2796 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002797 PyErr_BadInternalCall();
2798 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002800 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801}
2802
Victor Stinner137c34c2010-09-29 10:25:54 +00002803wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002804PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002805 Py_ssize_t *size)
2806{
2807 wchar_t* buffer;
2808 Py_ssize_t buflen;
2809
2810 if (unicode == NULL) {
2811 PyErr_BadInternalCall();
2812 return NULL;
2813 }
2814
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002815 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002816 if (buflen == -1)
2817 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002818 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00002819 if (buffer == NULL) {
2820 PyErr_NoMemory();
2821 return NULL;
2822 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002823 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002824 if (buflen == -1) {
2825 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002826 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002827 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002828 if (size != NULL)
2829 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002830 return buffer;
2831}
2832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002833#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834
Alexander Belopolsky40018472011-02-26 01:02:56 +00002835PyObject *
2836PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002837{
Victor Stinner8faf8212011-12-08 22:14:11 +01002838 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002839 PyErr_SetString(PyExc_ValueError,
2840 "chr() arg not in range(0x110000)");
2841 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002842 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002843
Victor Stinner985a82a2014-01-03 12:53:47 +01002844 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002845}
2846
Alexander Belopolsky40018472011-02-26 01:02:56 +00002847PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002848PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002850 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002851 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002852 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002853 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002854 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002855 Py_INCREF(obj);
2856 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002857 }
2858 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002859 /* For a Unicode subtype that's not a Unicode object,
2860 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002861 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002862 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002863 PyErr_Format(PyExc_TypeError,
2864 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002865 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002866 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002867}
2868
Alexander Belopolsky40018472011-02-26 01:02:56 +00002869PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002870PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002871 const char *encoding,
2872 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002873{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002874 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002875 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002876
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002878 PyErr_BadInternalCall();
2879 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002881
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002882 /* Decoding bytes objects is the most common case and should be fast */
2883 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002884 if (PyBytes_GET_SIZE(obj) == 0)
2885 _Py_RETURN_UNICODE_EMPTY();
2886 v = PyUnicode_Decode(
2887 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2888 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002889 return v;
2890 }
2891
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002892 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002893 PyErr_SetString(PyExc_TypeError,
2894 "decoding str is not supported");
2895 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002896 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002897
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002898 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2899 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2900 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02002901 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002902 Py_TYPE(obj)->tp_name);
2903 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002904 }
Tim Petersced69f82003-09-16 20:30:58 +00002905
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002906 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002907 PyBuffer_Release(&buffer);
2908 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002910
Serhiy Storchaka05997252013-01-26 12:14:02 +02002911 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002912 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002913 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002914}
2915
/* Normalize an encoding name so that spellings such as "UTF_8" can be
   matched: characters classified by Py_ISUPPER() are converted with
   Py_TOLOWER() and '_' is replaced with '-'.  A NULL encoding is normalized
   to "utf-8".

   lower is the output buffer and lower_len its capacity in bytes, including
   room for the terminating null byte.

   Return 1 on success, 0 on error (the normalized name is longer than
   lower_len-1, or lower_len is 0).  When 0 is returned, lower may hold a
   partial name without a null terminator. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* An empty buffer cannot even hold the null byte.  Without this check
       lower_len - 1 below wraps around, the end-of-buffer test never
       matches and the loop writes past the end of lower. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2955
Alexander Belopolsky40018472011-02-26 01:02:56 +00002956PyObject *
2957PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002958 Py_ssize_t size,
2959 const char *encoding,
2960 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002961{
2962 PyObject *buffer = NULL, *unicode;
2963 Py_buffer info;
2964 char lower[11]; /* Enough for any encoding shortcut */
2965
Fred Drakee4315f52000-05-09 19:53:39 +00002966 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002967 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002968 if ((strcmp(lower, "utf-8") == 0) ||
2969 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002970 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002971 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002972 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01002973 (strcmp(lower, "iso-8859-1") == 0) ||
2974 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002975 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002976#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002977 else if (strcmp(lower, "mbcs") == 0)
2978 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002979#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002980 else if (strcmp(lower, "ascii") == 0)
2981 return PyUnicode_DecodeASCII(s, size, errors);
2982 else if (strcmp(lower, "utf-16") == 0)
2983 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2984 else if (strcmp(lower, "utf-32") == 0)
2985 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2986 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002987
2988 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002989 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002990 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002991 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002992 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 if (buffer == NULL)
2994 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002995 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 if (unicode == NULL)
2997 goto onError;
2998 if (!PyUnicode_Check(unicode)) {
2999 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003000 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3001 "use codecs.decode() to decode to arbitrary types",
3002 encoding,
3003 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003004 Py_DECREF(unicode);
3005 goto onError;
3006 }
3007 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003008 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003009
Benjamin Peterson29060642009-01-31 22:14:21 +00003010 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003011 Py_XDECREF(buffer);
3012 return NULL;
3013}
3014
Alexander Belopolsky40018472011-02-26 01:02:56 +00003015PyObject *
3016PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003017 const char *encoding,
3018 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003019{
3020 PyObject *v;
3021
3022 if (!PyUnicode_Check(unicode)) {
3023 PyErr_BadArgument();
3024 goto onError;
3025 }
3026
3027 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003028 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003029
3030 /* Decode via the codec registry */
3031 v = PyCodec_Decode(unicode, encoding, errors);
3032 if (v == NULL)
3033 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003034 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003035
Benjamin Peterson29060642009-01-31 22:14:21 +00003036 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003037 return NULL;
3038}
3039
Alexander Belopolsky40018472011-02-26 01:02:56 +00003040PyObject *
3041PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003042 const char *encoding,
3043 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003044{
3045 PyObject *v;
3046
3047 if (!PyUnicode_Check(unicode)) {
3048 PyErr_BadArgument();
3049 goto onError;
3050 }
3051
3052 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003053 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003054
3055 /* Decode via the codec registry */
3056 v = PyCodec_Decode(unicode, encoding, errors);
3057 if (v == NULL)
3058 goto onError;
3059 if (!PyUnicode_Check(v)) {
3060 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003061 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3062 "use codecs.decode() to decode to arbitrary types",
3063 encoding,
3064 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003065 Py_DECREF(v);
3066 goto onError;
3067 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003068 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003069
Benjamin Peterson29060642009-01-31 22:14:21 +00003070 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003071 return NULL;
3072}
3073
Alexander Belopolsky40018472011-02-26 01:02:56 +00003074PyObject *
3075PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003076 Py_ssize_t size,
3077 const char *encoding,
3078 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003079{
3080 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003081
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 unicode = PyUnicode_FromUnicode(s, size);
3083 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3086 Py_DECREF(unicode);
3087 return v;
3088}
3089
Alexander Belopolsky40018472011-02-26 01:02:56 +00003090PyObject *
3091PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003092 const char *encoding,
3093 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003094{
3095 PyObject *v;
3096
3097 if (!PyUnicode_Check(unicode)) {
3098 PyErr_BadArgument();
3099 goto onError;
3100 }
3101
3102 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003103 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003104
3105 /* Encode via the codec registry */
3106 v = PyCodec_Encode(unicode, encoding, errors);
3107 if (v == NULL)
3108 goto onError;
3109 return v;
3110
Benjamin Peterson29060642009-01-31 22:14:21 +00003111 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003112 return NULL;
3113}
3114
/* Locate the first wide character of the null-terminated string wstr that
   wcstombs() refuses to convert in the current locale.  Characters are
   converted one at a time (a surrogate pair counts as one character when
   wchar_t is 16 bits wide).

   Return the offset, in wchar_t units from the start of wstr, of the
   offending character, or 0 if every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character (one or two wchar_t units) plus a null terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;

    for (cursor = wstr; *cursor != L'\0'; ) {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of the pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = *cursor;
        cursor += 1;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return char_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3161
Victor Stinner1b579672011-12-17 05:47:23 +01003162static int
3163locale_error_handler(const char *errors, int *surrogateescape)
3164{
3165 if (errors == NULL) {
3166 *surrogateescape = 0;
3167 return 0;
3168 }
3169
3170 if (strcmp(errors, "strict") == 0) {
3171 *surrogateescape = 0;
3172 return 0;
3173 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003174 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003175 *surrogateescape = 1;
3176 return 0;
3177 }
3178 PyErr_Format(PyExc_ValueError,
3179 "only 'strict' and 'surrogateescape' error handlers "
3180 "are supported, not '%s'",
3181 errors);
3182 return -1;
3183}
3184
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003185PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003186PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003187{
3188 Py_ssize_t wlen, wlen2;
3189 wchar_t *wstr;
3190 PyObject *bytes = NULL;
3191 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003192 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003193 PyObject *exc;
3194 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003195 int surrogateescape;
3196
3197 if (locale_error_handler(errors, &surrogateescape) < 0)
3198 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003199
3200 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3201 if (wstr == NULL)
3202 return NULL;
3203
3204 wlen2 = wcslen(wstr);
3205 if (wlen2 != wlen) {
3206 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003207 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003208 return NULL;
3209 }
3210
3211 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003212 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003213 char *str;
3214
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003215 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003216 if (str == NULL) {
3217 if (error_pos == (size_t)-1) {
3218 PyErr_NoMemory();
3219 PyMem_Free(wstr);
3220 return NULL;
3221 }
3222 else {
3223 goto encode_error;
3224 }
3225 }
3226 PyMem_Free(wstr);
3227
3228 bytes = PyBytes_FromString(str);
3229 PyMem_Free(str);
3230 }
3231 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003232 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003233 size_t len, len2;
3234
3235 len = wcstombs(NULL, wstr, 0);
3236 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003237 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003238 goto encode_error;
3239 }
3240
3241 bytes = PyBytes_FromStringAndSize(NULL, len);
3242 if (bytes == NULL) {
3243 PyMem_Free(wstr);
3244 return NULL;
3245 }
3246
3247 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3248 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003249 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003250 goto encode_error;
3251 }
3252 PyMem_Free(wstr);
3253 }
3254 return bytes;
3255
3256encode_error:
3257 errmsg = strerror(errno);
3258 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003259
3260 if (error_pos == (size_t)-1)
3261 error_pos = wcstombs_errorpos(wstr);
3262
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003263 PyMem_Free(wstr);
3264 Py_XDECREF(bytes);
3265
Victor Stinner2f197072011-12-17 07:08:30 +01003266 if (errmsg != NULL) {
3267 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003268 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003269 if (wstr != NULL) {
3270 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003271 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003272 } else
3273 errmsg = NULL;
3274 }
3275 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003276 reason = PyUnicode_FromString(
3277 "wcstombs() encountered an unencodable "
3278 "wide character");
3279 if (reason == NULL)
3280 return NULL;
3281
3282 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3283 "locale", unicode,
3284 (Py_ssize_t)error_pos,
3285 (Py_ssize_t)(error_pos+1),
3286 reason);
3287 Py_DECREF(reason);
3288 if (exc != NULL) {
3289 PyCodec_StrictErrors(exc);
3290 Py_XDECREF(exc);
3291 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003292 return NULL;
3293}
3294
Victor Stinnerad158722010-10-27 00:25:46 +00003295PyObject *
3296PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003297{
Victor Stinner99b95382011-07-04 14:23:54 +02003298#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003299 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003300#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003301 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003302#else
Victor Stinner793b5312011-04-27 00:24:21 +02003303 PyInterpreterState *interp = PyThreadState_GET()->interp;
3304 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3305 cannot use it to encode and decode filenames before it is loaded. Load
3306 the Python codec requires to encode at least its own filename. Use the C
3307 version of the locale codec until the codec registry is initialized and
3308 the Python codec is loaded.
3309
3310 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3311 cannot only rely on it: check also interp->fscodec_initialized for
3312 subinterpreters. */
3313 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003314 return PyUnicode_AsEncodedString(unicode,
3315 Py_FileSystemDefaultEncoding,
3316 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003317 }
3318 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003319 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003320 }
Victor Stinnerad158722010-10-27 00:25:46 +00003321#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003322}
3323
Alexander Belopolsky40018472011-02-26 01:02:56 +00003324PyObject *
3325PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003326 const char *encoding,
3327 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328{
3329 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003330 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003331
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 if (!PyUnicode_Check(unicode)) {
3333 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003334 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 }
Fred Drakee4315f52000-05-09 19:53:39 +00003336
Fred Drakee4315f52000-05-09 19:53:39 +00003337 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003338 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003339 if ((strcmp(lower, "utf-8") == 0) ||
3340 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003341 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003342 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003343 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003344 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003345 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003346 }
Victor Stinner37296e82010-06-10 13:36:23 +00003347 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003348 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003349 (strcmp(lower, "iso-8859-1") == 0) ||
3350 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003351 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003352#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003353 else if (strcmp(lower, "mbcs") == 0)
3354 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003355#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003356 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003357 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359
3360 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003361 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003363 return NULL;
3364
3365 /* The normal path */
3366 if (PyBytes_Check(v))
3367 return v;
3368
3369 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003370 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003371 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003372 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003373
3374 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003375 "encoder %s returned bytearray instead of bytes; "
3376 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003377 encoding);
3378 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003379 Py_DECREF(v);
3380 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003381 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003382
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003383 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3384 Py_DECREF(v);
3385 return b;
3386 }
3387
3388 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003389 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3390 "use codecs.encode() to encode to arbitrary types",
3391 encoding,
3392 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003393 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003394 return NULL;
3395}
3396
Alexander Belopolsky40018472011-02-26 01:02:56 +00003397PyObject *
3398PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003399 const char *encoding,
3400 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003401{
3402 PyObject *v;
3403
3404 if (!PyUnicode_Check(unicode)) {
3405 PyErr_BadArgument();
3406 goto onError;
3407 }
3408
3409 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003410 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003411
3412 /* Encode via the codec registry */
3413 v = PyCodec_Encode(unicode, encoding, errors);
3414 if (v == NULL)
3415 goto onError;
3416 if (!PyUnicode_Check(v)) {
3417 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003418 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3419 "use codecs.encode() to encode to arbitrary types",
3420 encoding,
3421 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003422 Py_DECREF(v);
3423 goto onError;
3424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003426
Benjamin Peterson29060642009-01-31 22:14:21 +00003427 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 return NULL;
3429}
3430
/* Locate the first byte sequence in the len bytes at str that mbrtowc()
   cannot convert in the current locale.

   Return the byte offset of that sequence from the start of str.  Return 0
   if no such sequence is found, if a null character is reached first, or if
   mbrtowc() is not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3461
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003462PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003463PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003464 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003465{
3466 wchar_t smallbuf[256];
3467 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3468 wchar_t *wstr;
3469 size_t wlen, wlen2;
3470 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003471 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003472 size_t error_pos;
3473 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003474 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3475 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003476
3477 if (locale_error_handler(errors, &surrogateescape) < 0)
3478 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003479
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003480 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3481 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003482 return NULL;
3483 }
3484
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003485 if (surrogateescape) {
3486 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003487 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003488 if (wstr == NULL) {
3489 if (wlen == (size_t)-1)
3490 PyErr_NoMemory();
3491 else
3492 PyErr_SetFromErrno(PyExc_OSError);
3493 return NULL;
3494 }
3495
3496 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003497 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003498 }
3499 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003500 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003501#ifndef HAVE_BROKEN_MBSTOWCS
3502 wlen = mbstowcs(NULL, str, 0);
3503#else
3504 wlen = len;
3505#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003506 if (wlen == (size_t)-1)
3507 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003508 if (wlen+1 <= smallbuf_len) {
3509 wstr = smallbuf;
3510 }
3511 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003512 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003513 if (!wstr)
3514 return PyErr_NoMemory();
3515 }
3516
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003517 wlen2 = mbstowcs(wstr, str, wlen+1);
3518 if (wlen2 == (size_t)-1) {
3519 if (wstr != smallbuf)
3520 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003521 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003522 }
3523#ifdef HAVE_BROKEN_MBSTOWCS
3524 assert(wlen2 == wlen);
3525#endif
3526 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3527 if (wstr != smallbuf)
3528 PyMem_Free(wstr);
3529 }
3530 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003531
3532decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003533 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003534 errmsg = strerror(errno);
3535 assert(errmsg != NULL);
3536
3537 error_pos = mbstowcs_errorpos(str, len);
3538 if (errmsg != NULL) {
3539 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003540 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003541 if (wstr != NULL) {
3542 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003543 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003544 }
Victor Stinner2f197072011-12-17 07:08:30 +01003545 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003546 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003547 reason = PyUnicode_FromString(
3548 "mbstowcs() encountered an invalid multibyte sequence");
3549 if (reason == NULL)
3550 return NULL;
3551
3552 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3553 "locale", str, len,
3554 (Py_ssize_t)error_pos,
3555 (Py_ssize_t)(error_pos+1),
3556 reason);
3557 Py_DECREF(reason);
3558 if (exc != NULL) {
3559 PyCodec_StrictErrors(exc);
3560 Py_XDECREF(exc);
3561 }
3562 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003563}
3564
3565PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003566PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003567{
3568 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003569 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003570}
3571
3572
3573PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003574PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003575 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003576 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3577}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003578
Christian Heimes5894ba72007-11-04 11:43:14 +00003579PyObject*
3580PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3581{
Victor Stinner99b95382011-07-04 14:23:54 +02003582#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003583 return PyUnicode_DecodeMBCS(s, size, NULL);
3584#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003585 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003586#else
Victor Stinner793b5312011-04-27 00:24:21 +02003587 PyInterpreterState *interp = PyThreadState_GET()->interp;
3588 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3589 cannot use it to encode and decode filenames before it is loaded. Load
3590 the Python codec requires to encode at least its own filename. Use the C
3591 version of the locale codec until the codec registry is initialized and
3592 the Python codec is loaded.
3593
3594 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3595 cannot only rely on it: check also interp->fscodec_initialized for
3596 subinterpreters. */
3597 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003598 return PyUnicode_Decode(s, size,
3599 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003600 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003601 }
3602 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003603 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003604 }
Victor Stinnerad158722010-10-27 00:25:46 +00003605#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003606}
3607
Martin v. Löwis011e8422009-05-05 04:43:17 +00003608
3609int
3610PyUnicode_FSConverter(PyObject* arg, void* addr)
3611{
3612 PyObject *output = NULL;
3613 Py_ssize_t size;
3614 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003615 if (arg == NULL) {
3616 Py_DECREF(*(PyObject**)addr);
3617 return 1;
3618 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003619 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003620 output = arg;
3621 Py_INCREF(output);
3622 }
3623 else {
3624 arg = PyUnicode_FromObject(arg);
3625 if (!arg)
3626 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003627 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003628 Py_DECREF(arg);
3629 if (!output)
3630 return 0;
3631 if (!PyBytes_Check(output)) {
3632 Py_DECREF(output);
3633 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3634 return 0;
3635 }
3636 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003637 size = PyBytes_GET_SIZE(output);
3638 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003639 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003640 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003641 Py_DECREF(output);
3642 return 0;
3643 }
3644 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003645 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003646}
3647
3648
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003649int
3650PyUnicode_FSDecoder(PyObject* arg, void* addr)
3651{
3652 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003653 if (arg == NULL) {
3654 Py_DECREF(*(PyObject**)addr);
3655 return 1;
3656 }
3657 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003658 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003659 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003660 output = arg;
3661 Py_INCREF(output);
3662 }
3663 else {
3664 arg = PyBytes_FromObject(arg);
3665 if (!arg)
3666 return 0;
3667 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3668 PyBytes_GET_SIZE(arg));
3669 Py_DECREF(arg);
3670 if (!output)
3671 return 0;
3672 if (!PyUnicode_Check(output)) {
3673 Py_DECREF(output);
3674 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3675 return 0;
3676 }
3677 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003678 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003679 Py_DECREF(output);
3680 return 0;
3681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003682 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003683 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003684 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003685 Py_DECREF(output);
3686 return 0;
3687 }
3688 *(PyObject**)addr = output;
3689 return Py_CLEANUP_SUPPORTED;
3690}
3691
3692
Martin v. Löwis5b222132007-06-10 09:51:05 +00003693char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003694PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003695{
Christian Heimesf3863112007-11-22 07:46:41 +00003696 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003697
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003698 if (!PyUnicode_Check(unicode)) {
3699 PyErr_BadArgument();
3700 return NULL;
3701 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003702 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003703 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003704
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003705 if (PyUnicode_UTF8(unicode) == NULL) {
3706 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003707 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3708 if (bytes == NULL)
3709 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003710 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3711 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003712 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003713 Py_DECREF(bytes);
3714 return NULL;
3715 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003716 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3717 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3718 PyBytes_AS_STRING(bytes),
3719 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003720 Py_DECREF(bytes);
3721 }
3722
3723 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003724 *psize = PyUnicode_UTF8_LENGTH(unicode);
3725 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003726}
3727
3728char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003729PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003730{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003731 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3732}
3733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003734Py_UNICODE *
3735PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3736{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003737 const unsigned char *one_byte;
3738#if SIZEOF_WCHAR_T == 4
3739 const Py_UCS2 *two_bytes;
3740#else
3741 const Py_UCS4 *four_bytes;
3742 const Py_UCS4 *ucs4_end;
3743 Py_ssize_t num_surrogates;
3744#endif
3745 wchar_t *w;
3746 wchar_t *wchar_end;
3747
3748 if (!PyUnicode_Check(unicode)) {
3749 PyErr_BadArgument();
3750 return NULL;
3751 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003752 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003753 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003754 assert(_PyUnicode_KIND(unicode) != 0);
3755 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003756
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003757 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003758#if SIZEOF_WCHAR_T == 2
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003759 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3760 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003761 num_surrogates = 0;
3762
3763 for (; four_bytes < ucs4_end; ++four_bytes) {
3764 if (*four_bytes > 0xFFFF)
3765 ++num_surrogates;
3766 }
3767
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003768 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3769 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3770 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003771 PyErr_NoMemory();
3772 return NULL;
3773 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003774 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003775
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003776 w = _PyUnicode_WSTR(unicode);
3777 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3778 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003779 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3780 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003781 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003782 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003783 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3784 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003785 }
3786 else
3787 *w = *four_bytes;
3788
3789 if (w > wchar_end) {
3790 assert(0 && "Miscalculated string end");
3791 }
3792 }
3793 *w = 0;
3794#else
3795 /* sizeof(wchar_t) == 4 */
3796 Py_FatalError("Impossible unicode object state, wstr and str "
3797 "should share memory already.");
3798 return NULL;
3799#endif
3800 }
3801 else {
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003802 if ((size_t)_PyUnicode_LENGTH(unicode) >
3803 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3804 PyErr_NoMemory();
3805 return NULL;
3806 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003807 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3808 (_PyUnicode_LENGTH(unicode) + 1));
3809 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003810 PyErr_NoMemory();
3811 return NULL;
3812 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003813 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3814 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3815 w = _PyUnicode_WSTR(unicode);
3816 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003817
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003818 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3819 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003820 for (; w < wchar_end; ++one_byte, ++w)
3821 *w = *one_byte;
3822 /* null-terminate the wstr */
3823 *w = 0;
3824 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003825 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003826#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003827 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003828 for (; w < wchar_end; ++two_bytes, ++w)
3829 *w = *two_bytes;
3830 /* null-terminate the wstr */
3831 *w = 0;
3832#else
3833 /* sizeof(wchar_t) == 2 */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003834 PyObject_FREE(_PyUnicode_WSTR(unicode));
3835 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003836 Py_FatalError("Impossible unicode object state, wstr "
3837 "and str should share memory already.");
3838 return NULL;
3839#endif
3840 }
3841 else {
3842 assert(0 && "This should never happen.");
3843 }
3844 }
3845 }
3846 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003847 *size = PyUnicode_WSTR_LENGTH(unicode);
3848 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003849}
3850
Alexander Belopolsky40018472011-02-26 01:02:56 +00003851Py_UNICODE *
3852PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003854 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855}
3856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003857
Alexander Belopolsky40018472011-02-26 01:02:56 +00003858Py_ssize_t
3859PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860{
3861 if (!PyUnicode_Check(unicode)) {
3862 PyErr_BadArgument();
3863 goto onError;
3864 }
3865 return PyUnicode_GET_SIZE(unicode);
3866
Benjamin Peterson29060642009-01-31 22:14:21 +00003867 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868 return -1;
3869}
3870
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003871Py_ssize_t
3872PyUnicode_GetLength(PyObject *unicode)
3873{
Victor Stinner07621332012-06-16 04:53:46 +02003874 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875 PyErr_BadArgument();
3876 return -1;
3877 }
Victor Stinner07621332012-06-16 04:53:46 +02003878 if (PyUnicode_READY(unicode) == -1)
3879 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003880 return PyUnicode_GET_LENGTH(unicode);
3881}
3882
3883Py_UCS4
3884PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3885{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003886 void *data;
3887 int kind;
3888
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003889 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3890 PyErr_BadArgument();
3891 return (Py_UCS4)-1;
3892 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003893 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003894 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003895 return (Py_UCS4)-1;
3896 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003897 data = PyUnicode_DATA(unicode);
3898 kind = PyUnicode_KIND(unicode);
3899 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003900}
3901
3902int
3903PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3904{
3905 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003906 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003907 return -1;
3908 }
Victor Stinner488fa492011-12-12 00:01:39 +01003909 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003910 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003911 PyErr_SetString(PyExc_IndexError, "string index out of range");
3912 return -1;
3913 }
Victor Stinner488fa492011-12-12 00:01:39 +01003914 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003915 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003916 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3917 PyErr_SetString(PyExc_ValueError, "character out of range");
3918 return -1;
3919 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003920 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3921 index, ch);
3922 return 0;
3923}
3924
/* Return the name of the default encoding.  The value is the fixed string
   "utf-8"; the caller must not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3930
Victor Stinner554f3f02010-06-16 23:33:54 +00003931/* create or adjust a UnicodeDecodeError */
3932static void
3933make_decode_exception(PyObject **exceptionObject,
3934 const char *encoding,
3935 const char *input, Py_ssize_t length,
3936 Py_ssize_t startpos, Py_ssize_t endpos,
3937 const char *reason)
3938{
3939 if (*exceptionObject == NULL) {
3940 *exceptionObject = PyUnicodeDecodeError_Create(
3941 encoding, input, length, startpos, endpos, reason);
3942 }
3943 else {
3944 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3945 goto onError;
3946 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3947 goto onError;
3948 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3949 goto onError;
3950 }
3951 return;
3952
3953onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02003954 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00003955}
3956
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   On success the replacement returned by the handler has been copied into
   the wchar_t buffer of *output at *outpos (resizing *output if needed),
   *outpos has been advanced past it, and *endinpos / *inptr point at the
   position where the handler asked decoding to resume.  *input and *inend
   are reloaded from the exception object because the callback may have
   replaced the input.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* "O!n" is the PyArg_ParseTuple format; the text after the ';' is the
       error message, which starts at offset 4. */
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Do not fall through and read a non-bytes object as bytes: report
           the error and give back the reference obtained above. */
        PyErr_SetString(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when doubling cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy by length: wcsncpy() would stop at an embedded null character in
       the replacement and zero-fill the remainder. */
    Py_MEMCPY(_PyUnicode_WSTR(*output) + *outpos, repwstr,
              repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4072
4073static int
4074unicode_decode_call_errorhandler_writer(
4075 const char *errors, PyObject **errorHandler,
4076 const char *encoding, const char *reason,
4077 const char **input, const char **inend, Py_ssize_t *startinpos,
4078 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4079 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4080{
4081 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4082
4083 PyObject *restuple = NULL;
4084 PyObject *repunicode = NULL;
4085 Py_ssize_t insize;
4086 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004087 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004088 PyObject *inputobj = NULL;
4089
4090 if (*errorHandler == NULL) {
4091 *errorHandler = PyCodec_LookupError(errors);
4092 if (*errorHandler == NULL)
4093 goto onError;
4094 }
4095
4096 make_decode_exception(exceptionObject,
4097 encoding,
4098 *input, *inend - *input,
4099 *startinpos, *endinpos,
4100 reason);
4101 if (*exceptionObject == NULL)
4102 goto onError;
4103
4104 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4105 if (restuple == NULL)
4106 goto onError;
4107 if (!PyTuple_Check(restuple)) {
4108 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4109 goto onError;
4110 }
4111 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004112 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004113
4114 /* Copy back the bytes variables, which might have been modified by the
4115 callback */
4116 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4117 if (!inputobj)
4118 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004119 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004120 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004121 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004122 *input = PyBytes_AS_STRING(inputobj);
4123 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004124 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004125 /* we can DECREF safely, as the exception has another reference,
4126 so the object won't go away. */
4127 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004128
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004129 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004130 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004131 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004132 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004133 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004134 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004135
Victor Stinner8f674cc2013-04-17 23:02:17 +02004136 if (PyUnicode_READY(repunicode) < 0)
4137 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004138 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004139 if (replen > 1) {
4140 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004141 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004142 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4143 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4144 goto onError;
4145 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004146 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004147 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004148
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004150 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004151
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004152 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004153 Py_XDECREF(restuple);
4154 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155
Benjamin Peterson29060642009-01-31 22:14:21 +00004156 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004157 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004158 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159}
4160
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004161/* --- UTF-7 Codec -------------------------------------------------------- */
4162
Antoine Pitrou244651a2009-05-04 18:56:13 +00004163/* See RFC2152 for details. We encode conservatively and decode liberally. */
4164
4165/* Three simple macros defining base-64. */
4166
4167/* Is c a base-64 character? */
4168
4169#define IS_BASE64(c) \
4170 (((c) >= 'A' && (c) <= 'Z') || \
4171 ((c) >= 'a' && (c) <= 'z') || \
4172 ((c) >= '0' && (c) <= '9') || \
4173 (c) == '+' || (c) == '/')
4174
4175/* given that c is a base-64 character, what is its base-64 value? */
4176
4177#define FROM_BASE64(c) \
4178 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
4179 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
4180 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
4181 (c) == '+' ? 62 : 63)
4182
4183/* What is the base-64 character of the bottom 6 bits of n? */
4184
4185#define TO_BASE64(n) \
4186 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4187
4188/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4189 * decoded as itself. We are permissive on decoding; the only ASCII
4190 * byte not decoding to itself is the + which begins a base64
4191 * string. */
4192
4193#define DECODE_DIRECT(c) \
4194 ((c) <= 127 && (c) != '+')
4195
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4229
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        - character code; only 1..127 can ever be direct
 *   directO  - non-zero: Set O characters (category 1) are direct
 *   directWS - non-zero: whitespace (category 2) is direct
 *
 * Set D characters (category 0) are always direct.  The flag arguments
 * are parenthesized so that compound expressions (e.g. `a || b`) keep
 * their meaning after expansion.  Note that c is evaluated more than
 * once. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004241
Alexander Belopolsky40018472011-02-26 01:02:56 +00004242PyObject *
4243PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004244 Py_ssize_t size,
4245 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004246{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004247 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4248}
4249
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * Parameters, as used below:
 *   s, size  - the input bytes; size == 0 yields the empty string.
 *   errors   - passed through to unicode_decode_call_errorhandler_writer().
 *   consumed - if NULL, input that ends inside a shift sequence with a
 *              pending surrogate, six or more buffered bits, or non-zero
 *              left-over bits is reported to the error handler as an
 *              "unterminated shift sequence".  If non-NULL, no such error
 *              is raised; *consumed is set to the offset of the opening
 *              '+' when the input ends inside a shift sequence (and the
 *              output produced since that '+' is dropped), otherwise to
 *              the number of bytes processed.
 *
 * Returns the decoded object, or NULL via the onError path. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    /* Assigned when a '+' or an unexpected special character is seen;
       every read below happens after one of those assignments. */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;
    /* Output position at the moment the current shift sequence began. */
    Py_ssize_t shiftOutStart;
    /* Number of valid low-order bits currently held in base64buffer. */
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    /* Pending high surrogate, or 0 when none is pending. */
    Py_UCS4 surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Also entered by goto from the end-of-string handling below when
           the error handler leaves input still to be processed. */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* not a pair: emit the pending high surrogate
                               on its own, then handle outCh below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        s++;
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            s++;
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* A high surrogate still pending at the end of the section
                   is written out unpaired when the terminator is a direct
                   character; otherwise it is discarded. */
                if (surrogate && DECODE_DIRECT(ch)) {
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                }
                surrogate = 0;
                if (ch == '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    s++;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts: used as the error range
               start and as the back-off position for *consumed */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                surrogate = 0;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* report input range [startinpos, current position) with errmsg */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        inShift = 0;
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have left input to process: jump back into
               the decoding loop */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            /* If the unfinished shift sequence already produced output and
               the writer has seen non-ASCII characters, build the result
               straight from the first shiftOutStart characters instead of
               truncating the writer.
               NOTE(review): presumably so the maximum character of the
               result is recomputed without the dropped tail -- confirm. */
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4448
4449
Alexander Belopolsky40018472011-02-26 01:02:56 +00004450PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004451_PyUnicode_EncodeUTF7(PyObject *str,
4452 int base64SetO,
4453 int base64WhiteSpace,
4454 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004455{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004456 int kind;
4457 void *data;
4458 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004459 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004460 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004461 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004462 unsigned int base64bits = 0;
4463 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004464 char * out;
4465 char * start;
4466
Benjamin Petersonbac79492012-01-14 13:34:47 -05004467 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004468 return NULL;
4469 kind = PyUnicode_KIND(str);
4470 data = PyUnicode_DATA(str);
4471 len = PyUnicode_GET_LENGTH(str);
4472
4473 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004474 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004475
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004476 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004477 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004478 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004479 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480 if (v == NULL)
4481 return NULL;
4482
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004483 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004484 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004485 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004486
Antoine Pitrou244651a2009-05-04 18:56:13 +00004487 if (inShift) {
4488 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4489 /* shifting out */
4490 if (base64bits) { /* output remaining bits */
4491 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4492 base64buffer = 0;
4493 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004494 }
4495 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004496 /* Characters not in the BASE64 set implicitly unshift the sequence
4497 so no '-' is required, except if the character is itself a '-' */
4498 if (IS_BASE64(ch) || ch == '-') {
4499 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004500 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004501 *out++ = (char) ch;
4502 }
4503 else {
4504 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004505 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004506 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004507 else { /* not in a shift sequence */
4508 if (ch == '+') {
4509 *out++ = '+';
4510 *out++ = '-';
4511 }
4512 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4513 *out++ = (char) ch;
4514 }
4515 else {
4516 *out++ = '+';
4517 inShift = 1;
4518 goto encode_char;
4519 }
4520 }
4521 continue;
4522encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004523 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004524 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004525
Antoine Pitrou244651a2009-05-04 18:56:13 +00004526 /* code first surrogate */
4527 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004528 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004529 while (base64bits >= 6) {
4530 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4531 base64bits -= 6;
4532 }
4533 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004534 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536 base64bits += 16;
4537 base64buffer = (base64buffer << 16) | ch;
4538 while (base64bits >= 6) {
4539 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4540 base64bits -= 6;
4541 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004542 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 if (base64bits)
4544 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4545 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004546 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004547 if (_PyBytes_Resize(&v, out - start) < 0)
4548 return NULL;
4549 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004551PyObject *
4552PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4553 Py_ssize_t size,
4554 int base64SetO,
4555 int base64WhiteSpace,
4556 const char *errors)
4557{
4558 PyObject *result;
4559 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4560 if (tmp == NULL)
4561 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004562 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004563 base64WhiteSpace, errors);
4564 Py_DECREF(tmp);
4565 return result;
4566}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004567
Antoine Pitrou244651a2009-05-04 18:56:13 +00004568#undef IS_BASE64
4569#undef FROM_BASE64
4570#undef TO_BASE64
4571#undef DECODE_DIRECT
4572#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004573
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574/* --- UTF-8 Codec -------------------------------------------------------- */
4575
Alexander Belopolsky40018472011-02-26 01:02:56 +00004576PyObject *
4577PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004578 Py_ssize_t size,
4579 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004580{
Walter Dörwald69652032004-09-07 20:24:22 +00004581 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4582}
4583
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004584#include "stringlib/asciilib.h"
4585#include "stringlib/codecs.h"
4586#include "stringlib/undef.h"
4587
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004588#include "stringlib/ucs1lib.h"
4589#include "stringlib/codecs.h"
4590#include "stringlib/undef.h"
4591
4592#include "stringlib/ucs2lib.h"
4593#include "stringlib/codecs.h"
4594#include "stringlib/undef.h"
4595
4596#include "stringlib/ucs4lib.h"
4597#include "stringlib/codecs.h"
4598#include "stringlib/undef.h"
4599
Antoine Pitrouab868312009-01-10 15:40:25 +00004600/* Mask to quickly check whether a C 'long' contains a
4601 non-ASCII, UTF8-encoded char. */
4602#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004603# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004604#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004605# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004606#else
4607# error C 'long' size should be either 4 or 8!
4608#endif
4609
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004610static Py_ssize_t
4611ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004612{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004613 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004614 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004615
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004616 /*
4617 * Issue #17237: m68k is a bit different from most architectures in
4618 * that objects do not use "natural alignment" - for example, int and
4619 * long are only aligned at 2-byte boundaries. Therefore the assert()
4620 * won't work; also, tests have shown that skipping the "optimised
4621 * version" will even speed up m68k.
4622 */
4623#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004624#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004625 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4626 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004627 /* Fast path, see in STRINGLIB(utf8_decode) for
4628 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004629 /* Help allocation */
4630 const char *_p = p;
4631 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004632 while (_p < aligned_end) {
4633 unsigned long value = *(const unsigned long *) _p;
4634 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004635 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004636 *((unsigned long *)q) = value;
4637 _p += SIZEOF_LONG;
4638 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004639 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004640 p = _p;
4641 while (p < end) {
4642 if ((unsigned char)*p & 0x80)
4643 break;
4644 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004646 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004648#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004649#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004650 while (p < end) {
4651 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4652 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004653 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004654 /* Help allocation */
4655 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004656 while (_p < aligned_end) {
4657 unsigned long value = *(unsigned long *) _p;
4658 if (value & ASCII_CHAR_MASK)
4659 break;
4660 _p += SIZEOF_LONG;
4661 }
4662 p = _p;
4663 if (_p == end)
4664 break;
4665 }
4666 if ((unsigned char)*p & 0x80)
4667 break;
4668 ++p;
4669 }
4670 memcpy(dest, start, p - start);
4671 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672}
Antoine Pitrouab868312009-01-10 15:40:25 +00004673
Victor Stinner785938e2011-12-11 20:09:03 +01004674PyObject *
4675PyUnicode_DecodeUTF8Stateful(const char *s,
4676 Py_ssize_t size,
4677 const char *errors,
4678 Py_ssize_t *consumed)
4679{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004680 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004681 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004682 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004683
4684 Py_ssize_t startinpos;
4685 Py_ssize_t endinpos;
4686 const char *errmsg = "";
4687 PyObject *errorHandler = NULL;
4688 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004689
4690 if (size == 0) {
4691 if (consumed)
4692 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004693 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004694 }
4695
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004696 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4697 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004698 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004699 *consumed = 1;
4700 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004701 }
4702
Victor Stinner8f674cc2013-04-17 23:02:17 +02004703 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004704 writer.min_length = size;
4705 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004706 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004707
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004708 writer.pos = ascii_decode(s, end, writer.data);
4709 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004710 while (s < end) {
4711 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004712 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004713 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004714 if (PyUnicode_IS_ASCII(writer.buffer))
4715 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004716 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004717 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004718 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004719 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004720 } else {
4721 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004722 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004723 }
4724
4725 switch (ch) {
4726 case 0:
4727 if (s == end || consumed)
4728 goto End;
4729 errmsg = "unexpected end of data";
4730 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004731 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004732 break;
4733 case 1:
4734 errmsg = "invalid start byte";
4735 startinpos = s - starts;
4736 endinpos = startinpos + 1;
4737 break;
4738 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004739 case 3:
4740 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004741 errmsg = "invalid continuation byte";
4742 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004743 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004744 break;
4745 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004746 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004747 goto onError;
4748 continue;
4749 }
4750
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004751 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004752 errors, &errorHandler,
4753 "utf-8", errmsg,
4754 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004755 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004756 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004757 }
4758
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004759End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004760 if (consumed)
4761 *consumed = s - starts;
4762
4763 Py_XDECREF(errorHandler);
4764 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004765 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004766
4767onError:
4768 Py_XDECREF(errorHandler);
4769 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004770 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004771 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004772}
4773
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Bytes that cannot be decoded are stored as 0xDC00 + byte.

   Return a pointer to a newly allocated wide character string (use
   PyMem_RawFree() to free the memory), or NULL on memory allocation error
   or when size + 1 wide characters would not fit in a Py_ssize_t. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size will always be longer than the resulting Unicode
       character count.  The bound is written without computing size + 1
       first, so the check itself cannot overflow. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* ch is a whole code point that does not fit in one 16-bit
               unit (not itself a surrogate): it is split into a surrogate
               pair here. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004829
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004830/* Primary internal function which creates utf8 encoded bytes objects.
4831
4832 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004833 and allocate exactly as much space needed at the end. Else allocate the
4834 maximum possible needed (4 result bytes per Unicode character), and return
4835 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004836*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004837PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004838_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839{
Victor Stinner6099a032011-12-18 14:22:26 +01004840 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004841 void *data;
4842 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004844 if (!PyUnicode_Check(unicode)) {
4845 PyErr_BadArgument();
4846 return NULL;
4847 }
4848
4849 if (PyUnicode_READY(unicode) == -1)
4850 return NULL;
4851
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004852 if (PyUnicode_UTF8(unicode))
4853 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4854 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004855
4856 kind = PyUnicode_KIND(unicode);
4857 data = PyUnicode_DATA(unicode);
4858 size = PyUnicode_GET_LENGTH(unicode);
4859
Benjamin Petersonead6b532011-12-20 17:23:42 -06004860 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004861 default:
4862 assert(0);
4863 case PyUnicode_1BYTE_KIND:
4864 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4865 assert(!PyUnicode_IS_ASCII(unicode));
4866 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4867 case PyUnicode_2BYTE_KIND:
4868 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4869 case PyUnicode_4BYTE_KIND:
4870 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004871 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872}
4873
Alexander Belopolsky40018472011-02-26 01:02:56 +00004874PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004875PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4876 Py_ssize_t size,
4877 const char *errors)
4878{
4879 PyObject *v, *unicode;
4880
4881 unicode = PyUnicode_FromUnicode(s, size);
4882 if (unicode == NULL)
4883 return NULL;
4884 v = _PyUnicode_AsUTF8String(unicode, errors);
4885 Py_DECREF(unicode);
4886 return v;
4887}
4888
4889PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004890PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004892 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893}
4894
Walter Dörwald41980ca2007-08-16 21:55:45 +00004895/* --- UTF-32 Codec ------------------------------------------------------- */
4896
4897PyObject *
4898PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004899 Py_ssize_t size,
4900 const char *errors,
4901 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004902{
4903 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4904}
4905
4906PyObject *
4907PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004908 Py_ssize_t size,
4909 const char *errors,
4910 int *byteorder,
4911 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004912{
4913 const char *starts = s;
4914 Py_ssize_t startinpos;
4915 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004916 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004917 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004918 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004919 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004920 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004921 PyObject *errorHandler = NULL;
4922 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004923
Walter Dörwald41980ca2007-08-16 21:55:45 +00004924 q = (unsigned char *)s;
4925 e = q + size;
4926
4927 if (byteorder)
4928 bo = *byteorder;
4929
4930 /* Check for BOM marks (U+FEFF) in the input and adjust current
4931 byte order setting accordingly. In native mode, the leading BOM
4932 mark is skipped, in all other modes, it is copied to the output
4933 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004934 if (bo == 0 && size >= 4) {
4935 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4936 if (bom == 0x0000FEFF) {
4937 bo = -1;
4938 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004939 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004940 else if (bom == 0xFFFE0000) {
4941 bo = 1;
4942 q += 4;
4943 }
4944 if (byteorder)
4945 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004946 }
4947
Victor Stinnere64322e2012-10-30 23:12:47 +01004948 if (q == e) {
4949 if (consumed)
4950 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004951 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004952 }
4953
Victor Stinnere64322e2012-10-30 23:12:47 +01004954#ifdef WORDS_BIGENDIAN
4955 le = bo < 0;
4956#else
4957 le = bo <= 0;
4958#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004959 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004960
Victor Stinner8f674cc2013-04-17 23:02:17 +02004961 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004962 writer.min_length = (e - q + 3) / 4;
4963 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004964 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004965
Victor Stinnere64322e2012-10-30 23:12:47 +01004966 while (1) {
4967 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004968 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004969
Victor Stinnere64322e2012-10-30 23:12:47 +01004970 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004971 enum PyUnicode_Kind kind = writer.kind;
4972 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004973 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004974 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004975 if (le) {
4976 do {
4977 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4978 if (ch > maxch)
4979 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004980 if (kind != PyUnicode_1BYTE_KIND &&
4981 Py_UNICODE_IS_SURROGATE(ch))
4982 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004983 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004984 q += 4;
4985 } while (q <= last);
4986 }
4987 else {
4988 do {
4989 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4990 if (ch > maxch)
4991 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004992 if (kind != PyUnicode_1BYTE_KIND &&
4993 Py_UNICODE_IS_SURROGATE(ch))
4994 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004995 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004996 q += 4;
4997 } while (q <= last);
4998 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004999 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005000 }
5001
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005002 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005003 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005004 startinpos = ((const char *)q) - starts;
5005 endinpos = startinpos + 4;
5006 }
5007 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005008 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005009 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005010 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005011 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005012 startinpos = ((const char *)q) - starts;
5013 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005014 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005015 else {
5016 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005017 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005018 goto onError;
5019 q += 4;
5020 continue;
5021 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005022 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005023 startinpos = ((const char *)q) - starts;
5024 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005025 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005026
5027 /* The remaining input chars are ignored if the callback
5028 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005029 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005030 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005031 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005032 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005033 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005034 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005035 }
5036
Walter Dörwald41980ca2007-08-16 21:55:45 +00005037 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005039
Walter Dörwald41980ca2007-08-16 21:55:45 +00005040 Py_XDECREF(errorHandler);
5041 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005042 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005043
Benjamin Peterson29060642009-01-31 22:14:21 +00005044 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005045 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005046 Py_XDECREF(errorHandler);
5047 Py_XDECREF(exc);
5048 return NULL;
5049}
5050
5051PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005052_PyUnicode_EncodeUTF32(PyObject *str,
5053 const char *errors,
5054 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005055{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005056 enum PyUnicode_Kind kind;
5057 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005058 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005059 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005060 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005061#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005062 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005063#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005064 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005065#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005066 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005067 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005068 PyObject *errorHandler = NULL;
5069 PyObject *exc = NULL;
5070 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005071
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005072 if (!PyUnicode_Check(str)) {
5073 PyErr_BadArgument();
5074 return NULL;
5075 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005076 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005077 return NULL;
5078 kind = PyUnicode_KIND(str);
5079 data = PyUnicode_DATA(str);
5080 len = PyUnicode_GET_LENGTH(str);
5081
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005082 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005083 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005084 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005085 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005086 if (v == NULL)
5087 return NULL;
5088
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005089 /* output buffer is 4-bytes aligned */
5090 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5091 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005092 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005093 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005094 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005095 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005096
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005097 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005098 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005099 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005100 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005101 else
5102 encoding = "utf-32";
5103
5104 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005105 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5106 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005107 }
5108
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005109 pos = 0;
5110 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005111 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005112
5113 if (kind == PyUnicode_2BYTE_KIND) {
5114 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5115 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005116 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005117 else {
5118 assert(kind == PyUnicode_4BYTE_KIND);
5119 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5120 &out, native_ordering);
5121 }
5122 if (pos == len)
5123 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005124
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005125 rep = unicode_encode_call_errorhandler(
5126 errors, &errorHandler,
5127 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005128 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005129 if (!rep)
5130 goto error;
5131
5132 if (PyBytes_Check(rep)) {
5133 repsize = PyBytes_GET_SIZE(rep);
5134 if (repsize & 3) {
5135 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005136 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005137 "surrogates not allowed");
5138 goto error;
5139 }
5140 moreunits = repsize / 4;
5141 }
5142 else {
5143 assert(PyUnicode_Check(rep));
5144 if (PyUnicode_READY(rep) < 0)
5145 goto error;
5146 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5147 if (!PyUnicode_IS_ASCII(rep)) {
5148 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005149 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005150 "surrogates not allowed");
5151 goto error;
5152 }
5153 }
5154
5155 /* four bytes are reserved for each surrogate */
5156 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005157 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005158 Py_ssize_t morebytes = 4 * (moreunits - 1);
5159 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5160 /* integer overflow */
5161 PyErr_NoMemory();
5162 goto error;
5163 }
5164 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5165 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005166 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005167 }
5168
5169 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005170 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5171 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005172 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005173 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005174 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5175 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005176 }
5177
5178 Py_CLEAR(rep);
5179 }
5180
5181 /* Cut back to size actually needed. This is necessary for, for example,
5182 encoding of a string containing isolated surrogates and the 'ignore'
5183 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005184 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005185 if (nsize != PyBytes_GET_SIZE(v))
5186 _PyBytes_Resize(&v, nsize);
5187 Py_XDECREF(errorHandler);
5188 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005189 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005190 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005191 error:
5192 Py_XDECREF(rep);
5193 Py_XDECREF(errorHandler);
5194 Py_XDECREF(exc);
5195 Py_XDECREF(v);
5196 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005197}
5198
Alexander Belopolsky40018472011-02-26 01:02:56 +00005199PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005200PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5201 Py_ssize_t size,
5202 const char *errors,
5203 int byteorder)
5204{
5205 PyObject *result;
5206 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5207 if (tmp == NULL)
5208 return NULL;
5209 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5210 Py_DECREF(tmp);
5211 return result;
5212}
5213
5214PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005215PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005216{
Victor Stinnerb960b342011-11-20 19:12:52 +01005217 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005218}
5219
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220/* --- UTF-16 Codec ------------------------------------------------------- */
5221
Tim Peters772747b2001-08-09 22:21:55 +00005222PyObject *
5223PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005224 Py_ssize_t size,
5225 const char *errors,
5226 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227{
Walter Dörwald69652032004-09-07 20:24:22 +00005228 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5229}
5230
5231PyObject *
5232PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005233 Py_ssize_t size,
5234 const char *errors,
5235 int *byteorder,
5236 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005237{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005238 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005239 Py_ssize_t startinpos;
5240 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005241 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005242 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005243 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005244 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005245 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005246 PyObject *errorHandler = NULL;
5247 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005248 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249
Tim Peters772747b2001-08-09 22:21:55 +00005250 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005251 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252
5253 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005254 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005256 /* Check for BOM marks (U+FEFF) in the input and adjust current
5257 byte order setting accordingly. In native mode, the leading BOM
5258 mark is skipped, in all other modes, it is copied to the output
5259 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005260 if (bo == 0 && size >= 2) {
5261 const Py_UCS4 bom = (q[1] << 8) | q[0];
5262 if (bom == 0xFEFF) {
5263 q += 2;
5264 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005265 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005266 else if (bom == 0xFFFE) {
5267 q += 2;
5268 bo = 1;
5269 }
5270 if (byteorder)
5271 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005272 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273
Antoine Pitrou63065d72012-05-15 23:48:04 +02005274 if (q == e) {
5275 if (consumed)
5276 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005277 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005278 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005279
Christian Heimes743e0cd2012-10-17 23:52:17 +02005280#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005281 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005282 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005283#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005284 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005285 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005286#endif
Tim Peters772747b2001-08-09 22:21:55 +00005287
Antoine Pitrou63065d72012-05-15 23:48:04 +02005288 /* Note: size will always be longer than the resulting Unicode
5289 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005290 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005291 writer.min_length = (e - q + 1) / 2;
5292 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005293 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005294
Antoine Pitrou63065d72012-05-15 23:48:04 +02005295 while (1) {
5296 Py_UCS4 ch = 0;
5297 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005298 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005299 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005300 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005301 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005302 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005303 native_ordering);
5304 else
5305 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005306 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005307 native_ordering);
5308 } else if (kind == PyUnicode_2BYTE_KIND) {
5309 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005310 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005311 native_ordering);
5312 } else {
5313 assert(kind == PyUnicode_4BYTE_KIND);
5314 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005315 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005316 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005317 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005318 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005319
Antoine Pitrou63065d72012-05-15 23:48:04 +02005320 switch (ch)
5321 {
5322 case 0:
5323 /* remaining byte at the end? (size should be even) */
5324 if (q == e || consumed)
5325 goto End;
5326 errmsg = "truncated data";
5327 startinpos = ((const char *)q) - starts;
5328 endinpos = ((const char *)e) - starts;
5329 break;
5330 /* The remaining input chars are ignored if the callback
5331 chooses to skip the input */
5332 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005333 q -= 2;
5334 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005335 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005336 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005337 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005338 endinpos = ((const char *)e) - starts;
5339 break;
5340 case 2:
5341 errmsg = "illegal encoding";
5342 startinpos = ((const char *)q) - 2 - starts;
5343 endinpos = startinpos + 2;
5344 break;
5345 case 3:
5346 errmsg = "illegal UTF-16 surrogate";
5347 startinpos = ((const char *)q) - 4 - starts;
5348 endinpos = startinpos + 2;
5349 break;
5350 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005351 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005352 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005353 continue;
5354 }
5355
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005356 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005357 errors,
5358 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005359 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005360 &starts,
5361 (const char **)&e,
5362 &startinpos,
5363 &endinpos,
5364 &exc,
5365 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005366 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005367 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 }
5369
Antoine Pitrou63065d72012-05-15 23:48:04 +02005370End:
Walter Dörwald69652032004-09-07 20:24:22 +00005371 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005372 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005373
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005374 Py_XDECREF(errorHandler);
5375 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005376 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005379 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005380 Py_XDECREF(errorHandler);
5381 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 return NULL;
5383}
5384
Tim Peters772747b2001-08-09 22:21:55 +00005385PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005386_PyUnicode_EncodeUTF16(PyObject *str,
5387 const char *errors,
5388 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005390 enum PyUnicode_Kind kind;
5391 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005392 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005393 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005394 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005395 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005396#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005397 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005398#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005399 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005400#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005401 const char *encoding;
5402 Py_ssize_t nsize, pos;
5403 PyObject *errorHandler = NULL;
5404 PyObject *exc = NULL;
5405 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005406
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005407 if (!PyUnicode_Check(str)) {
5408 PyErr_BadArgument();
5409 return NULL;
5410 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005411 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005412 return NULL;
5413 kind = PyUnicode_KIND(str);
5414 data = PyUnicode_DATA(str);
5415 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005416
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005417 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005418 if (kind == PyUnicode_4BYTE_KIND) {
5419 const Py_UCS4 *in = (const Py_UCS4 *)data;
5420 const Py_UCS4 *end = in + len;
5421 while (in < end)
5422 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005423 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005424 }
5425 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005426 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005427 nsize = len + pairs + (byteorder == 0);
5428 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429 if (v == NULL)
5430 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005432 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005433 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005434 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005436 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005437 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005438 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005439
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005440 if (kind == PyUnicode_1BYTE_KIND) {
5441 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5442 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005443 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005444
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005445 if (byteorder < 0)
5446 encoding = "utf-16-le";
5447 else if (byteorder > 0)
5448 encoding = "utf-16-be";
5449 else
5450 encoding = "utf-16";
5451
5452 pos = 0;
5453 while (pos < len) {
5454 Py_ssize_t repsize, moreunits;
5455
5456 if (kind == PyUnicode_2BYTE_KIND) {
5457 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5458 &out, native_ordering);
5459 }
5460 else {
5461 assert(kind == PyUnicode_4BYTE_KIND);
5462 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5463 &out, native_ordering);
5464 }
5465 if (pos == len)
5466 break;
5467
5468 rep = unicode_encode_call_errorhandler(
5469 errors, &errorHandler,
5470 encoding, "surrogates not allowed",
5471 str, &exc, pos, pos + 1, &pos);
5472 if (!rep)
5473 goto error;
5474
5475 if (PyBytes_Check(rep)) {
5476 repsize = PyBytes_GET_SIZE(rep);
5477 if (repsize & 1) {
5478 raise_encode_exception(&exc, encoding,
5479 str, pos - 1, pos,
5480 "surrogates not allowed");
5481 goto error;
5482 }
5483 moreunits = repsize / 2;
5484 }
5485 else {
5486 assert(PyUnicode_Check(rep));
5487 if (PyUnicode_READY(rep) < 0)
5488 goto error;
5489 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5490 if (!PyUnicode_IS_ASCII(rep)) {
5491 raise_encode_exception(&exc, encoding,
5492 str, pos - 1, pos,
5493 "surrogates not allowed");
5494 goto error;
5495 }
5496 }
5497
5498 /* two bytes are reserved for each surrogate */
5499 if (moreunits > 1) {
5500 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5501 Py_ssize_t morebytes = 2 * (moreunits - 1);
5502 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5503 /* integer overflow */
5504 PyErr_NoMemory();
5505 goto error;
5506 }
5507 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5508 goto error;
5509 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5510 }
5511
5512 if (PyBytes_Check(rep)) {
5513 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5514 out += moreunits;
5515 } else /* rep is unicode */ {
5516 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5517 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5518 &out, native_ordering);
5519 }
5520
5521 Py_CLEAR(rep);
5522 }
5523
5524 /* Cut back to size actually needed. This is necessary for, for example,
5525 encoding of a string containing isolated surrogates and the 'ignore' handler
5526 is used. */
5527 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5528 if (nsize != PyBytes_GET_SIZE(v))
5529 _PyBytes_Resize(&v, nsize);
5530 Py_XDECREF(errorHandler);
5531 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005532 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005533 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005534 error:
5535 Py_XDECREF(rep);
5536 Py_XDECREF(errorHandler);
5537 Py_XDECREF(exc);
5538 Py_XDECREF(v);
5539 return NULL;
5540#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541}
5542
Alexander Belopolsky40018472011-02-26 01:02:56 +00005543PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005544PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5545 Py_ssize_t size,
5546 const char *errors,
5547 int byteorder)
5548{
5549 PyObject *result;
5550 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5551 if (tmp == NULL)
5552 return NULL;
5553 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5554 Py_DECREF(tmp);
5555 return result;
5556}
5557
5558PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005559PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005561 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562}
5563
5564/* --- Unicode Escape Codec ----------------------------------------------- */
5565
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string still decodes to a pure ASCII string.

   s     -- start of the escaped input
   size  -- number of bytes available at s

   Returns the number of characters the decoded string will contain.
   Returns -1 if size is negative, if the input contains a non-ASCII
   byte, if it ends in the middle of a backslash escape, or if it uses an
   escape (octal, \x, \u, \U, \N) whose value is not guaranteed to stay
   inside the ASCII range.
 */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size *before* computing s + size, so that no
       out-of-range pointer is ever formed. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* unknown escape: the decoder keeps both the backslash
                   and the following character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5620
Fredrik Lundh06d12682001-01-24 07:59:11 +00005621static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005622
Alexander Belopolsky40018472011-02-26 01:02:56 +00005623PyObject *
5624PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005625 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005626 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005628 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005629 Py_ssize_t startinpos;
5630 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005631 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005633 char* message;
5634 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005635 PyObject *errorHandler = NULL;
5636 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005637 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005638
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005639 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005640 if (len == 0)
5641 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005642
5643 /* After length_of_escaped_ascii_string() there are two alternatives,
5644 either the string is pure ASCII with named escapes like \n, etc.
5645 and we determined it's exact size (common case)
5646 or it contains \x, \u, ... escape sequences. then we create a
5647 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005648 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005649 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005650 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005651 }
5652 else {
5653 /* Escaped strings will always be longer than the resulting
5654 Unicode string, so we start with size here and then reduce the
5655 length after conversion to the true value.
5656 (but if the error callback returns a long replacement string
5657 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005658 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005659 }
5660
Guido van Rossumd57fd912000-03-10 22:53:23 +00005661 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005662 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005664
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 while (s < end) {
5666 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005667 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005668 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669
5670 /* Non-escape characters are interpreted as Unicode ordinals */
5671 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005672 x = (unsigned char)*s;
5673 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005674 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005675 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676 continue;
5677 }
5678
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005679 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 /* \ - Escapes */
5681 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005682 c = *s++;
5683 if (s > end)
5684 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005685
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005686 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687
Benjamin Peterson29060642009-01-31 22:14:21 +00005688 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005689#define WRITECHAR(ch) \
5690 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005691 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005692 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005693 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005694
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005696 case '\\': WRITECHAR('\\'); break;
5697 case '\'': WRITECHAR('\''); break;
5698 case '\"': WRITECHAR('\"'); break;
5699 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005700 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005701 case 'f': WRITECHAR('\014'); break;
5702 case 't': WRITECHAR('\t'); break;
5703 case 'n': WRITECHAR('\n'); break;
5704 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005705 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005706 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005707 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005708 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709
Benjamin Peterson29060642009-01-31 22:14:21 +00005710 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 case '0': case '1': case '2': case '3':
5712 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005713 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005714 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005715 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005716 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005717 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005718 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005719 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 break;
5721
Benjamin Peterson29060642009-01-31 22:14:21 +00005722 /* hex escapes */
5723 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005725 digits = 2;
5726 message = "truncated \\xXX escape";
5727 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728
Benjamin Peterson29060642009-01-31 22:14:21 +00005729 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005731 digits = 4;
5732 message = "truncated \\uXXXX escape";
5733 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734
Benjamin Peterson29060642009-01-31 22:14:21 +00005735 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005736 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005737 digits = 8;
5738 message = "truncated \\UXXXXXXXX escape";
5739 hexescape:
5740 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005741 if (end - s < digits) {
5742 /* count only hex digits */
5743 for (; s < end; ++s) {
5744 c = (unsigned char)*s;
5745 if (!Py_ISXDIGIT(c))
5746 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005747 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005748 goto error;
5749 }
5750 for (; digits--; ++s) {
5751 c = (unsigned char)*s;
5752 if (!Py_ISXDIGIT(c))
5753 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005754 chr = (chr<<4) & ~0xF;
5755 if (c >= '0' && c <= '9')
5756 chr += c - '0';
5757 else if (c >= 'a' && c <= 'f')
5758 chr += 10 + c - 'a';
5759 else
5760 chr += 10 + c - 'A';
5761 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005762 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005763 /* _decoding_error will have already written into the
5764 target buffer. */
5765 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005766 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005767 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005768 message = "illegal Unicode character";
5769 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005770 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005771 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005772 break;
5773
Benjamin Peterson29060642009-01-31 22:14:21 +00005774 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005775 case 'N':
5776 message = "malformed \\N character escape";
5777 if (ucnhash_CAPI == NULL) {
5778 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005779 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5780 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005781 if (ucnhash_CAPI == NULL)
5782 goto ucnhashError;
5783 }
5784 if (*s == '{') {
5785 const char *start = s+1;
5786 /* look for the closing brace */
5787 while (*s != '}' && s < end)
5788 s++;
5789 if (s > start && s < end && *s == '}') {
5790 /* found a name. look it up in the unicode database */
5791 message = "unknown Unicode character name";
5792 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005793 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005794 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005795 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005796 goto store;
5797 }
5798 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005799 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005800
5801 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005802 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005803 message = "\\ at end of string";
5804 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005805 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005806 }
5807 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005808 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005809 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005810 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005811 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005813 continue;
5814
5815 error:
5816 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005817 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005818 errors, &errorHandler,
5819 "unicodeescape", message,
5820 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005821 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005822 goto onError;
5823 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005825#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005826
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005827 Py_XDECREF(errorHandler);
5828 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005829 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005830
Benjamin Peterson29060642009-01-31 22:14:21 +00005831 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005832 PyErr_SetString(
5833 PyExc_UnicodeError,
5834 "\\N escapes not supported (can't load unicodedata module)"
5835 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005836 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005837 Py_XDECREF(errorHandler);
5838 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005839 return NULL;
5840
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005842 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005843 Py_XDECREF(errorHandler);
5844 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845 return NULL;
5846}
5847
5848/* Return a Unicode-Escape string version of the Unicode object.
5849
5850 If quotes is true, the string is enclosed in u"" or u'' quotes as
5851 appropriate.
5852
5853*/
5854
Alexander Belopolsky40018472011-02-26 01:02:56 +00005855PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005856PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005858 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005859 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005860 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005861 int kind;
5862 void *data;
5863 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864
Ezio Melottie7f90372012-10-05 03:33:31 +03005865 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005866 escape.
5867
Ezio Melottie7f90372012-10-05 03:33:31 +03005868 For UCS1 strings it's '\xxx', 4 bytes per source character.
5869 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5870 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005871 */
5872
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005873 if (!PyUnicode_Check(unicode)) {
5874 PyErr_BadArgument();
5875 return NULL;
5876 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005877 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005878 return NULL;
5879 len = PyUnicode_GET_LENGTH(unicode);
5880 kind = PyUnicode_KIND(unicode);
5881 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005882 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005883 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5884 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5885 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5886 }
5887
5888 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005889 return PyBytes_FromStringAndSize(NULL, 0);
5890
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005891 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005892 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005893
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005894 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005895 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005896 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005897 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005898 if (repr == NULL)
5899 return NULL;
5900
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005901 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005903 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005904 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005905
Walter Dörwald79e913e2007-05-12 11:08:06 +00005906 /* Escape backslashes */
5907 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 *p++ = '\\';
5909 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005910 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005911 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005912
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005913 /* Map 21-bit characters to '\U00xxxxxx' */
5914 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005915 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005916 *p++ = '\\';
5917 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005918 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5919 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5920 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5921 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5922 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5923 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5924 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5925 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005926 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005927 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005928
Guido van Rossumd57fd912000-03-10 22:53:23 +00005929 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005930 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931 *p++ = '\\';
5932 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005933 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5934 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5935 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5936 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005938
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005939 /* Map special whitespace to '\t', \n', '\r' */
5940 else if (ch == '\t') {
5941 *p++ = '\\';
5942 *p++ = 't';
5943 }
5944 else if (ch == '\n') {
5945 *p++ = '\\';
5946 *p++ = 'n';
5947 }
5948 else if (ch == '\r') {
5949 *p++ = '\\';
5950 *p++ = 'r';
5951 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005952
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005953 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005954 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005956 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005957 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5958 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005959 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005960
Guido van Rossumd57fd912000-03-10 22:53:23 +00005961 /* Copy everything else as-is */
5962 else
5963 *p++ = (char) ch;
5964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005966 assert(p - PyBytes_AS_STRING(repr) > 0);
5967 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5968 return NULL;
5969 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970}
5971
Alexander Belopolsky40018472011-02-26 01:02:56 +00005972PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005973PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5974 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005975{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005976 PyObject *result;
5977 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5978 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005980 result = PyUnicode_AsUnicodeEscapeString(tmp);
5981 Py_DECREF(tmp);
5982 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983}
5984
5985/* --- Raw Unicode Escape Codec ------------------------------------------- */
5986
Alexander Belopolsky40018472011-02-26 01:02:56 +00005987PyObject *
5988PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005989 Py_ssize_t size,
5990 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005992 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005993 Py_ssize_t startinpos;
5994 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005995 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996 const char *end;
5997 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005998 PyObject *errorHandler = NULL;
5999 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006000
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006001 if (size == 0)
6002 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006003
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004 /* Escaped strings will always be longer than the resulting
6005 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006006 length after conversion to the true value. (But decoding error
6007 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006008 _PyUnicodeWriter_Init(&writer);
6009 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006010
Guido van Rossumd57fd912000-03-10 22:53:23 +00006011 end = s + size;
6012 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006013 unsigned char c;
6014 Py_UCS4 x;
6015 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006016 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017
Benjamin Peterson29060642009-01-31 22:14:21 +00006018 /* Non-escape characters are interpreted as Unicode ordinals */
6019 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006020 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006021 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006022 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006023 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006024 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 startinpos = s-starts;
6026
6027 /* \u-escapes are only interpreted iff the number of leading
6028 backslashes if odd */
6029 bs = s;
6030 for (;s < end;) {
6031 if (*s != '\\')
6032 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006033 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006034 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006035 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006036 }
6037 if (((s - bs) & 1) == 0 ||
6038 s >= end ||
6039 (*s != 'u' && *s != 'U')) {
6040 continue;
6041 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006042 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006043 count = *s=='u' ? 4 : 8;
6044 s++;
6045
6046 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 for (x = 0, i = 0; i < count; ++i, ++s) {
6048 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006049 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006050 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006051 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006052 errors, &errorHandler,
6053 "rawunicodeescape", "truncated \\uXXXX",
6054 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006055 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 goto onError;
6057 goto nextByte;
6058 }
6059 x = (x<<4) & ~0xF;
6060 if (c >= '0' && c <= '9')
6061 x += c - '0';
6062 else if (c >= 'a' && c <= 'f')
6063 x += 10 + c - 'a';
6064 else
6065 x += 10 + c - 'A';
6066 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006067 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006068 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006069 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006070 }
6071 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006072 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006073 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006074 errors, &errorHandler,
6075 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006076 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006077 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006078 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006079 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 nextByte:
6081 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006083 Py_XDECREF(errorHandler);
6084 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006085 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006086
Benjamin Peterson29060642009-01-31 22:14:21 +00006087 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006088 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006089 Py_XDECREF(errorHandler);
6090 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006091 return NULL;
6092}
6093
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006094
Alexander Belopolsky40018472011-02-26 01:02:56 +00006095PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006096PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006097{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006098 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099 char *p;
6100 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006101 Py_ssize_t expandsize, pos;
6102 int kind;
6103 void *data;
6104 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006105
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006106 if (!PyUnicode_Check(unicode)) {
6107 PyErr_BadArgument();
6108 return NULL;
6109 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006110 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006111 return NULL;
6112 kind = PyUnicode_KIND(unicode);
6113 data = PyUnicode_DATA(unicode);
6114 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006115 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6116 bytes, and 1 byte characters 4. */
6117 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006118
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006119 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006120 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006121
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006122 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123 if (repr == NULL)
6124 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006125 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006126 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006128 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006129 for (pos = 0; pos < len; pos++) {
6130 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006131 /* Map 32-bit characters to '\Uxxxxxxxx' */
6132 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006133 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006134 *p++ = '\\';
6135 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006136 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6137 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6138 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6139 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6140 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6141 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6142 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6143 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006144 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006145 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006146 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147 *p++ = '\\';
6148 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006149 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6150 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6151 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6152 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006154 /* Copy everything else as-is */
6155 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156 *p++ = (char) ch;
6157 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006158
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006159 assert(p > q);
6160 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006161 return NULL;
6162 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163}
6164
Alexander Belopolsky40018472011-02-26 01:02:56 +00006165PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006166PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6167 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006169 PyObject *result;
6170 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6171 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006172 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6174 Py_DECREF(tmp);
6175 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176}
6177
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006178/* --- Unicode Internal Codec ------------------------------------------- */
6179
Alexander Belopolsky40018472011-02-26 01:02:56 +00006180PyObject *
6181_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006182 Py_ssize_t size,
6183 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006184{
6185 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006186 Py_ssize_t startinpos;
6187 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006188 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006189 const char *end;
6190 const char *reason;
6191 PyObject *errorHandler = NULL;
6192 PyObject *exc = NULL;
6193
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006194 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006195 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006196 1))
6197 return NULL;
6198
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006199 if (size == 0)
6200 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006201
Victor Stinner8f674cc2013-04-17 23:02:17 +02006202 _PyUnicodeWriter_Init(&writer);
6203 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6204 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006205 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006206 }
6207 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006208
Victor Stinner8f674cc2013-04-17 23:02:17 +02006209 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006210 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006211 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006212 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006213 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006214 endinpos = end-starts;
6215 reason = "truncated input";
6216 goto error;
6217 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006218 /* We copy the raw representation one byte at a time because the
6219 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006220 ((char *) &uch)[0] = s[0];
6221 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006222#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006223 ((char *) &uch)[2] = s[2];
6224 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006225#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006226 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006227#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006228 /* We have to sanity check the raw data, otherwise doom looms for
6229 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006230 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006231 endinpos = s - starts + Py_UNICODE_SIZE;
6232 reason = "illegal code point (> 0x10FFFF)";
6233 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006234 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006235#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006236 s += Py_UNICODE_SIZE;
6237#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006238 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006239 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006240 Py_UNICODE uch2;
6241 ((char *) &uch2)[0] = s[0];
6242 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006243 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006244 {
Victor Stinner551ac952011-11-29 22:58:13 +01006245 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006246 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006247 }
6248 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006249#endif
6250
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006251 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006252 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006253 continue;
6254
6255 error:
6256 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006257 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006258 errors, &errorHandler,
6259 "unicode_internal", reason,
6260 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006261 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006262 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006263 }
6264
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006265 Py_XDECREF(errorHandler);
6266 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006267 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006268
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006270 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006271 Py_XDECREF(errorHandler);
6272 Py_XDECREF(exc);
6273 return NULL;
6274}
6275
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276/* --- Latin-1 Codec ------------------------------------------------------ */
6277
Alexander Belopolsky40018472011-02-26 01:02:56 +00006278PyObject *
6279PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006280 Py_ssize_t size,
6281 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006282{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006284 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285}
6286
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006287/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006288static void
6289make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006290 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006291 PyObject *unicode,
6292 Py_ssize_t startpos, Py_ssize_t endpos,
6293 const char *reason)
6294{
6295 if (*exceptionObject == NULL) {
6296 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006297 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006298 encoding, unicode, startpos, endpos, reason);
6299 }
6300 else {
6301 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6302 goto onError;
6303 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6304 goto onError;
6305 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6306 goto onError;
6307 return;
6308 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006309 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006310 }
6311}
6312
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006313/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006314static void
6315raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006316 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006317 PyObject *unicode,
6318 Py_ssize_t startpos, Py_ssize_t endpos,
6319 const char *reason)
6320{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006321 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006322 encoding, unicode, startpos, endpos, reason);
6323 if (*exceptionObject != NULL)
6324 PyCodec_StrictErrors(*exceptionObject);
6325}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006326
6327/* error handling callback helper:
6328 build arguments, call the callback and check the arguments,
6329 put the result into newpos and return the replacement string, which
6330 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006331static PyObject *
6332unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006333 PyObject **errorHandler,
6334 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006335 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006336 Py_ssize_t startpos, Py_ssize_t endpos,
6337 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006338{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006339 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006340 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006341 PyObject *restuple;
6342 PyObject *resunicode;
6343
6344 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006345 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006346 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006347 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006348 }
6349
Benjamin Petersonbac79492012-01-14 13:34:47 -05006350 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006351 return NULL;
6352 len = PyUnicode_GET_LENGTH(unicode);
6353
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006354 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006355 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006357 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006358
6359 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006360 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006361 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006362 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006363 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006364 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006365 Py_DECREF(restuple);
6366 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006367 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006368 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 &resunicode, newpos)) {
6370 Py_DECREF(restuple);
6371 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006372 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006373 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6374 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6375 Py_DECREF(restuple);
6376 return NULL;
6377 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006378 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006379 *newpos = len + *newpos;
6380 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006381 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006382 Py_DECREF(restuple);
6383 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006384 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006385 Py_INCREF(resunicode);
6386 Py_DECREF(restuple);
6387 return resunicode;
6388}
6389
Alexander Belopolsky40018472011-02-26 01:02:56 +00006390static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006391unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006392 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006393 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006394{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006395 /* input state */
6396 Py_ssize_t pos=0, size;
6397 int kind;
6398 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006399 /* output object */
6400 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006401 /* pointer into the output */
6402 char *str;
6403 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006404 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006405 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6406 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006407 PyObject *errorHandler = NULL;
6408 PyObject *exc = NULL;
6409 /* the following variable is used for caching string comparisons
6410 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6411 int known_errorHandler = -1;
6412
Benjamin Petersonbac79492012-01-14 13:34:47 -05006413 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006414 return NULL;
6415 size = PyUnicode_GET_LENGTH(unicode);
6416 kind = PyUnicode_KIND(unicode);
6417 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006418 /* allocate enough for a simple encoding without
6419 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006420 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006421 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006422 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006423 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006424 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006425 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006426 ressize = size;
6427
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006428 while (pos < size) {
6429 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430
Benjamin Peterson29060642009-01-31 22:14:21 +00006431 /* can we encode this? */
6432 if (c<limit) {
6433 /* no overflow check, because we know that the space is enough */
6434 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006435 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006436 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006437 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006438 Py_ssize_t requiredsize;
6439 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006440 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006442 Py_ssize_t collstart = pos;
6443 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006444 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006445 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006446 ++collend;
6447 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6448 if (known_errorHandler==-1) {
6449 if ((errors==NULL) || (!strcmp(errors, "strict")))
6450 known_errorHandler = 1;
6451 else if (!strcmp(errors, "replace"))
6452 known_errorHandler = 2;
6453 else if (!strcmp(errors, "ignore"))
6454 known_errorHandler = 3;
6455 else if (!strcmp(errors, "xmlcharrefreplace"))
6456 known_errorHandler = 4;
6457 else
6458 known_errorHandler = 0;
6459 }
6460 switch (known_errorHandler) {
6461 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006462 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006463 goto onError;
6464 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006465 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006466 *str++ = '?'; /* fall through */
6467 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006468 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 break;
6470 case 4: /* xmlcharrefreplace */
6471 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006472 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006473 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006474 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006475 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006476 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006477 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006478 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006479 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006480 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006481 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006482 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006483 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006484 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006485 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006486 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006487 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006488 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006489 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006490 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006491 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006492 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006493 if (requiredsize > PY_SSIZE_T_MAX - incr)
6494 goto overflow;
6495 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006496 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006497 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6498 goto overflow;
6499 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006501 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006502 requiredsize = 2*ressize;
6503 if (_PyBytes_Resize(&res, requiredsize))
6504 goto onError;
6505 str = PyBytes_AS_STRING(res) + respos;
6506 ressize = requiredsize;
6507 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006508 /* generate replacement */
6509 for (i = collstart; i < collend; ++i) {
6510 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006511 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006512 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006513 break;
6514 default:
6515 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 encoding, reason, unicode, &exc,
6517 collstart, collend, &newpos);
6518 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006519 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006520 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006521 if (PyBytes_Check(repunicode)) {
6522 /* Directly copy bytes result to output. */
6523 repsize = PyBytes_Size(repunicode);
6524 if (repsize > 1) {
6525 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006526 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006527 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6528 Py_DECREF(repunicode);
6529 goto overflow;
6530 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006531 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6532 Py_DECREF(repunicode);
6533 goto onError;
6534 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006535 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006536 ressize += repsize-1;
6537 }
6538 memcpy(str, PyBytes_AsString(repunicode), repsize);
6539 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006540 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006541 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006542 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006543 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 /* need more space? (at least enough for what we
6545 have+the replacement+the rest of the string, so
6546 we won't have to check space for encodable characters) */
6547 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006548 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006549 requiredsize = respos;
6550 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6551 goto overflow;
6552 requiredsize += repsize;
6553 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6554 goto overflow;
6555 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006557 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006558 requiredsize = 2*ressize;
6559 if (_PyBytes_Resize(&res, requiredsize)) {
6560 Py_DECREF(repunicode);
6561 goto onError;
6562 }
6563 str = PyBytes_AS_STRING(res) + respos;
6564 ressize = requiredsize;
6565 }
6566 /* check if there is anything unencodable in the replacement
6567 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006568 for (i = 0; repsize-->0; ++i, ++str) {
6569 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006570 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006571 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006572 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006573 Py_DECREF(repunicode);
6574 goto onError;
6575 }
6576 *str = (char)c;
6577 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006578 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006579 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006580 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006581 }
6582 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006583 /* Resize if we allocated to much */
6584 size = str - PyBytes_AS_STRING(res);
6585 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006586 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006587 if (_PyBytes_Resize(&res, size) < 0)
6588 goto onError;
6589 }
6590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006591 Py_XDECREF(errorHandler);
6592 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006593 return res;
6594
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006595 overflow:
6596 PyErr_SetString(PyExc_OverflowError,
6597 "encoded result is too long for a Python string");
6598
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006599 onError:
6600 Py_XDECREF(res);
6601 Py_XDECREF(errorHandler);
6602 Py_XDECREF(exc);
6603 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006604}
6605
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006606/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006607PyObject *
6608PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006609 Py_ssize_t size,
6610 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006612 PyObject *result;
6613 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6614 if (unicode == NULL)
6615 return NULL;
6616 result = unicode_encode_ucs1(unicode, errors, 256);
6617 Py_DECREF(unicode);
6618 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619}
6620
Alexander Belopolsky40018472011-02-26 01:02:56 +00006621PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006622_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623{
6624 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006625 PyErr_BadArgument();
6626 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006628 if (PyUnicode_READY(unicode) == -1)
6629 return NULL;
6630 /* Fast path: if it is a one-byte string, construct
6631 bytes object directly. */
6632 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6633 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6634 PyUnicode_GET_LENGTH(unicode));
6635 /* Non-Latin-1 characters present. Defer to above function to
6636 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006637 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006638}
6639
6640PyObject*
6641PyUnicode_AsLatin1String(PyObject *unicode)
6642{
6643 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644}
6645
6646/* --- 7-bit ASCII Codec -------------------------------------------------- */
6647
Alexander Belopolsky40018472011-02-26 01:02:56 +00006648PyObject *
6649PyUnicode_DecodeASCII(const char *s,
6650 Py_ssize_t size,
6651 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006653 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006654 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006655 int kind;
6656 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006657 Py_ssize_t startinpos;
6658 Py_ssize_t endinpos;
6659 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006660 const char *e;
6661 PyObject *errorHandler = NULL;
6662 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006663
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006665 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006666
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006668 if (size == 1 && (unsigned char)s[0] < 128)
6669 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006670
Victor Stinner8f674cc2013-04-17 23:02:17 +02006671 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006672 writer.min_length = size;
6673 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006674 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006675
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006676 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006677 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006678 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006679 writer.pos = outpos;
6680 if (writer.pos == size)
6681 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006682
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006683 s += writer.pos;
6684 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006685 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006686 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006687 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006688 PyUnicode_WRITE(kind, data, writer.pos, c);
6689 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006690 ++s;
6691 }
6692 else {
6693 startinpos = s-starts;
6694 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006695 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006696 errors, &errorHandler,
6697 "ascii", "ordinal not in range(128)",
6698 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006699 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006700 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006701 kind = writer.kind;
6702 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705 Py_XDECREF(errorHandler);
6706 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006707 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006708
Benjamin Peterson29060642009-01-31 22:14:21 +00006709 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006710 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006711 Py_XDECREF(errorHandler);
6712 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 return NULL;
6714}
6715
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006716/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006717PyObject *
6718PyUnicode_EncodeASCII(const Py_UNICODE *p,
6719 Py_ssize_t size,
6720 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006721{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006722 PyObject *result;
6723 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6724 if (unicode == NULL)
6725 return NULL;
6726 result = unicode_encode_ucs1(unicode, errors, 128);
6727 Py_DECREF(unicode);
6728 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006729}
6730
Alexander Belopolsky40018472011-02-26 01:02:56 +00006731PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006732_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733{
6734 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 PyErr_BadArgument();
6736 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006738 if (PyUnicode_READY(unicode) == -1)
6739 return NULL;
6740 /* Fast path: if it is an ASCII-only string, construct bytes object
6741 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006742 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006743 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6744 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006745 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006746}
6747
6748PyObject *
6749PyUnicode_AsASCIIString(PyObject *unicode)
6750{
6751 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752}
6753
Victor Stinner99b95382011-07-04 14:23:54 +02006754#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006755
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00006756/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006757
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00006758#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006759#define NEED_RETRY
6760#endif
6761
Victor Stinner3a50e702011-10-18 21:21:00 +02006762#ifndef WC_ERR_INVALID_CHARS
6763# define WC_ERR_INVALID_CHARS 0x0080
6764#endif
6765
6766static char*
6767code_page_name(UINT code_page, PyObject **obj)
6768{
6769 *obj = NULL;
6770 if (code_page == CP_ACP)
6771 return "mbcs";
6772 if (code_page == CP_UTF7)
6773 return "CP_UTF7";
6774 if (code_page == CP_UTF8)
6775 return "CP_UTF8";
6776
6777 *obj = PyBytes_FromFormat("cp%u", code_page);
6778 if (*obj == NULL)
6779 return NULL;
6780 return PyBytes_AS_STRING(*obj);
6781}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006782
Victor Stinner3a50e702011-10-18 21:21:00 +02006783static DWORD
6784decode_code_page_flags(UINT code_page)
6785{
6786 if (code_page == CP_UTF7) {
6787 /* The CP_UTF7 decoder only supports flags=0 */
6788 return 0;
6789 }
6790 else
6791 return MB_ERR_INVALID_CHARS;
6792}
6793
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006794/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006795 * Decode a byte string from a Windows code page into unicode object in strict
6796 * mode.
6797 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006798 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6799 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006800 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006801static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006802decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006803 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006804 const char *in,
6805 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006806{
Victor Stinner3a50e702011-10-18 21:21:00 +02006807 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006808 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006809 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006810
6811 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006812 assert(insize > 0);
6813 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6814 if (outsize <= 0)
6815 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006816
6817 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006819 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006820 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 if (*v == NULL)
6822 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006823 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006824 }
6825 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006827 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006828 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006829 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006830 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006831 }
6832
6833 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006834 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6835 if (outsize <= 0)
6836 goto error;
6837 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006838
Victor Stinner3a50e702011-10-18 21:21:00 +02006839error:
6840 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6841 return -2;
6842 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006843 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006844}
6845
Victor Stinner3a50e702011-10-18 21:21:00 +02006846/*
6847 * Decode a byte string from a code page into unicode object with an error
6848 * handler.
6849 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006850 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006851 * UnicodeDecodeError exception and returns -1 on error.
6852 */
6853static int
6854decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006855 PyObject **v,
6856 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006857 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006858{
6859 const char *startin = in;
6860 const char *endin = in + size;
6861 const DWORD flags = decode_code_page_flags(code_page);
6862 /* Ideally, we should get reason from FormatMessage. This is the Windows
6863 2000 English version of the message. */
6864 const char *reason = "No mapping for the Unicode character exists "
6865 "in the target code page.";
6866 /* each step cannot decode more than 1 character, but a character can be
6867 represented as a surrogate pair */
6868 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006869 int insize;
6870 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006871 PyObject *errorHandler = NULL;
6872 PyObject *exc = NULL;
6873 PyObject *encoding_obj = NULL;
6874 char *encoding;
6875 DWORD err;
6876 int ret = -1;
6877
6878 assert(size > 0);
6879
6880 encoding = code_page_name(code_page, &encoding_obj);
6881 if (encoding == NULL)
6882 return -1;
6883
Victor Stinner7d00cc12014-03-17 23:08:06 +01006884 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006885 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6886 UnicodeDecodeError. */
6887 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6888 if (exc != NULL) {
6889 PyCodec_StrictErrors(exc);
6890 Py_CLEAR(exc);
6891 }
6892 goto error;
6893 }
6894
6895 if (*v == NULL) {
6896 /* Create unicode object */
6897 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6898 PyErr_NoMemory();
6899 goto error;
6900 }
Victor Stinnerab595942011-12-17 04:59:06 +01006901 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006902 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006903 if (*v == NULL)
6904 goto error;
6905 startout = PyUnicode_AS_UNICODE(*v);
6906 }
6907 else {
6908 /* Extend unicode object */
6909 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6910 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6911 PyErr_NoMemory();
6912 goto error;
6913 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006914 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006915 goto error;
6916 startout = PyUnicode_AS_UNICODE(*v) + n;
6917 }
6918
6919 /* Decode the byte string character per character */
6920 out = startout;
6921 while (in < endin)
6922 {
6923 /* Decode a character */
6924 insize = 1;
6925 do
6926 {
6927 outsize = MultiByteToWideChar(code_page, flags,
6928 in, insize,
6929 buffer, Py_ARRAY_LENGTH(buffer));
6930 if (outsize > 0)
6931 break;
6932 err = GetLastError();
6933 if (err != ERROR_NO_UNICODE_TRANSLATION
6934 && err != ERROR_INSUFFICIENT_BUFFER)
6935 {
6936 PyErr_SetFromWindowsErr(0);
6937 goto error;
6938 }
6939 insize++;
6940 }
6941 /* 4=maximum length of a UTF-8 sequence */
6942 while (insize <= 4 && (in + insize) <= endin);
6943
6944 if (outsize <= 0) {
6945 Py_ssize_t startinpos, endinpos, outpos;
6946
Victor Stinner7d00cc12014-03-17 23:08:06 +01006947 /* last character in partial decode? */
6948 if (in + insize >= endin && !final)
6949 break;
6950
Victor Stinner3a50e702011-10-18 21:21:00 +02006951 startinpos = in - startin;
6952 endinpos = startinpos + 1;
6953 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006954 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006955 errors, &errorHandler,
6956 encoding, reason,
6957 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006958 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006959 {
6960 goto error;
6961 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006962 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006963 }
6964 else {
6965 in += insize;
6966 memcpy(out, buffer, outsize * sizeof(wchar_t));
6967 out += outsize;
6968 }
6969 }
6970
6971 /* write a NUL character at the end */
6972 *out = 0;
6973
6974 /* Extend unicode object */
6975 outsize = out - startout;
6976 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006977 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006978 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02006979 /* (in - startin) <= size and size is an int */
6980 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02006981
6982error:
6983 Py_XDECREF(encoding_obj);
6984 Py_XDECREF(errorHandler);
6985 Py_XDECREF(exc);
6986 return ret;
6987}
6988
Victor Stinner3a50e702011-10-18 21:21:00 +02006989static PyObject *
6990decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006991 const char *s, Py_ssize_t size,
6992 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006993{
Victor Stinner76a31a62011-11-04 00:05:13 +01006994 PyObject *v = NULL;
6995 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006996
Victor Stinner3a50e702011-10-18 21:21:00 +02006997 if (code_page < 0) {
6998 PyErr_SetString(PyExc_ValueError, "invalid code page number");
6999 return NULL;
7000 }
7001
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007002 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007003 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007004
Victor Stinner76a31a62011-11-04 00:05:13 +01007005 do
7006 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007007#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007008 if (size > INT_MAX) {
7009 chunk_size = INT_MAX;
7010 final = 0;
7011 done = 0;
7012 }
7013 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007014#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007015 {
7016 chunk_size = (int)size;
7017 final = (consumed == NULL);
7018 done = 1;
7019 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007020
Victor Stinner76a31a62011-11-04 00:05:13 +01007021 if (chunk_size == 0 && done) {
7022 if (v != NULL)
7023 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007024 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007025 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007026
Victor Stinner76a31a62011-11-04 00:05:13 +01007027 converted = decode_code_page_strict(code_page, &v,
7028 s, chunk_size);
7029 if (converted == -2)
7030 converted = decode_code_page_errors(code_page, &v,
7031 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007032 errors, final);
7033 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007034
7035 if (converted < 0) {
7036 Py_XDECREF(v);
7037 return NULL;
7038 }
7039
7040 if (consumed)
7041 *consumed += converted;
7042
7043 s += converted;
7044 size -= converted;
7045 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007046
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007047 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007048}
7049
Alexander Belopolsky40018472011-02-26 01:02:56 +00007050PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007051PyUnicode_DecodeCodePageStateful(int code_page,
7052 const char *s,
7053 Py_ssize_t size,
7054 const char *errors,
7055 Py_ssize_t *consumed)
7056{
7057 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7058}
7059
7060PyObject *
7061PyUnicode_DecodeMBCSStateful(const char *s,
7062 Py_ssize_t size,
7063 const char *errors,
7064 Py_ssize_t *consumed)
7065{
7066 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7067}
7068
7069PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007070PyUnicode_DecodeMBCS(const char *s,
7071 Py_ssize_t size,
7072 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007073{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007074 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7075}
7076
Victor Stinner3a50e702011-10-18 21:21:00 +02007077static DWORD
7078encode_code_page_flags(UINT code_page, const char *errors)
7079{
7080 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007081 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007082 }
7083 else if (code_page == CP_UTF7) {
7084 /* CP_UTF7 only supports flags=0 */
7085 return 0;
7086 }
7087 else {
7088 if (errors != NULL && strcmp(errors, "replace") == 0)
7089 return 0;
7090 else
7091 return WC_NO_BEST_FIT_CHARS;
7092 }
7093}
7094
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007095/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007096 * Encode a Unicode string to a Windows code page into a byte string in strict
7097 * mode.
7098 *
7099 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007100 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007101 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007102static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007103encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007104 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007105 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007106{
Victor Stinner554f3f02010-06-16 23:33:54 +00007107 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007108 BOOL *pusedDefaultChar = &usedDefaultChar;
7109 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007110 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007111 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007112 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007113 const DWORD flags = encode_code_page_flags(code_page, NULL);
7114 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007115 /* Create a substring so that we can get the UTF-16 representation
7116 of just the slice under consideration. */
7117 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118
Martin v. Löwis3d325192011-11-04 18:23:06 +01007119 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007120
Victor Stinner3a50e702011-10-18 21:21:00 +02007121 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007122 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007123 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007124 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007125
Victor Stinner2fc507f2011-11-04 20:06:39 +01007126 substring = PyUnicode_Substring(unicode, offset, offset+len);
7127 if (substring == NULL)
7128 return -1;
7129 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7130 if (p == NULL) {
7131 Py_DECREF(substring);
7132 return -1;
7133 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007134 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007135
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007136 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007137 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007138 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007139 NULL, 0,
7140 NULL, pusedDefaultChar);
7141 if (outsize <= 0)
7142 goto error;
7143 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007144 if (pusedDefaultChar && *pusedDefaultChar) {
7145 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007146 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007147 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007148
Victor Stinner3a50e702011-10-18 21:21:00 +02007149 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007150 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007151 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007152 if (*outbytes == NULL) {
7153 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007154 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007155 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007156 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007157 }
7158 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007159 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007160 const Py_ssize_t n = PyBytes_Size(*outbytes);
7161 if (outsize > PY_SSIZE_T_MAX - n) {
7162 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007163 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007164 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007166 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7167 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007169 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007171 }
7172
7173 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007175 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 out, outsize,
7177 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007178 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 if (outsize <= 0)
7180 goto error;
7181 if (pusedDefaultChar && *pusedDefaultChar)
7182 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007183 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007184
Victor Stinner3a50e702011-10-18 21:21:00 +02007185error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007186 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7188 return -2;
7189 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007190 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007191}
7192
Victor Stinner3a50e702011-10-18 21:21:00 +02007193/*
7194 * Encode a Unicode string to a Windows code page into a byte string using a
7195 * error handler.
7196 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007197 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 * -1 on other error.
7199 */
7200static int
7201encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007202 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007203 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007204{
Victor Stinner3a50e702011-10-18 21:21:00 +02007205 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007206 Py_ssize_t pos = unicode_offset;
7207 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007208 /* Ideally, we should get reason from FormatMessage. This is the Windows
7209 2000 English version of the message. */
7210 const char *reason = "invalid character";
7211 /* 4=maximum length of a UTF-8 sequence */
7212 char buffer[4];
7213 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7214 Py_ssize_t outsize;
7215 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 PyObject *errorHandler = NULL;
7217 PyObject *exc = NULL;
7218 PyObject *encoding_obj = NULL;
7219 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007220 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007221 PyObject *rep;
7222 int ret = -1;
7223
7224 assert(insize > 0);
7225
7226 encoding = code_page_name(code_page, &encoding_obj);
7227 if (encoding == NULL)
7228 return -1;
7229
7230 if (errors == NULL || strcmp(errors, "strict") == 0) {
7231 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7232 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007233 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007234 if (exc != NULL) {
7235 PyCodec_StrictErrors(exc);
7236 Py_DECREF(exc);
7237 }
7238 Py_XDECREF(encoding_obj);
7239 return -1;
7240 }
7241
7242 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7243 pusedDefaultChar = &usedDefaultChar;
7244 else
7245 pusedDefaultChar = NULL;
7246
7247 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7248 PyErr_NoMemory();
7249 goto error;
7250 }
7251 outsize = insize * Py_ARRAY_LENGTH(buffer);
7252
7253 if (*outbytes == NULL) {
7254 /* Create string object */
7255 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7256 if (*outbytes == NULL)
7257 goto error;
7258 out = PyBytes_AS_STRING(*outbytes);
7259 }
7260 else {
7261 /* Extend string object */
7262 Py_ssize_t n = PyBytes_Size(*outbytes);
7263 if (n > PY_SSIZE_T_MAX - outsize) {
7264 PyErr_NoMemory();
7265 goto error;
7266 }
7267 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7268 goto error;
7269 out = PyBytes_AS_STRING(*outbytes) + n;
7270 }
7271
7272 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007273 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007274 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007275 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7276 wchar_t chars[2];
7277 int charsize;
7278 if (ch < 0x10000) {
7279 chars[0] = (wchar_t)ch;
7280 charsize = 1;
7281 }
7282 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007283 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7284 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007285 charsize = 2;
7286 }
7287
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007289 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007290 buffer, Py_ARRAY_LENGTH(buffer),
7291 NULL, pusedDefaultChar);
7292 if (outsize > 0) {
7293 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7294 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007295 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 memcpy(out, buffer, outsize);
7297 out += outsize;
7298 continue;
7299 }
7300 }
7301 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7302 PyErr_SetFromWindowsErr(0);
7303 goto error;
7304 }
7305
Victor Stinner3a50e702011-10-18 21:21:00 +02007306 rep = unicode_encode_call_errorhandler(
7307 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007308 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007309 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007310 if (rep == NULL)
7311 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007312 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007313
7314 if (PyBytes_Check(rep)) {
7315 outsize = PyBytes_GET_SIZE(rep);
7316 if (outsize != 1) {
7317 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7318 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7319 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7320 Py_DECREF(rep);
7321 goto error;
7322 }
7323 out = PyBytes_AS_STRING(*outbytes) + offset;
7324 }
7325 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7326 out += outsize;
7327 }
7328 else {
7329 Py_ssize_t i;
7330 enum PyUnicode_Kind kind;
7331 void *data;
7332
Benjamin Petersonbac79492012-01-14 13:34:47 -05007333 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007334 Py_DECREF(rep);
7335 goto error;
7336 }
7337
7338 outsize = PyUnicode_GET_LENGTH(rep);
7339 if (outsize != 1) {
7340 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7341 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7342 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7343 Py_DECREF(rep);
7344 goto error;
7345 }
7346 out = PyBytes_AS_STRING(*outbytes) + offset;
7347 }
7348 kind = PyUnicode_KIND(rep);
7349 data = PyUnicode_DATA(rep);
7350 for (i=0; i < outsize; i++) {
7351 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7352 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007353 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007354 encoding, unicode,
7355 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007356 "unable to encode error handler result to ASCII");
7357 Py_DECREF(rep);
7358 goto error;
7359 }
7360 *out = (unsigned char)ch;
7361 out++;
7362 }
7363 }
7364 Py_DECREF(rep);
7365 }
7366 /* write a NUL byte */
7367 *out = 0;
7368 outsize = out - PyBytes_AS_STRING(*outbytes);
7369 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7370 if (_PyBytes_Resize(outbytes, outsize) < 0)
7371 goto error;
7372 ret = 0;
7373
7374error:
7375 Py_XDECREF(encoding_obj);
7376 Py_XDECREF(errorHandler);
7377 Py_XDECREF(exc);
7378 return ret;
7379}
7380
Victor Stinner3a50e702011-10-18 21:21:00 +02007381static PyObject *
7382encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007383 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007384 const char *errors)
7385{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007386 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007387 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007388 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007389 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007390
Victor Stinner29dacf22015-01-26 16:41:32 +01007391 if (!PyUnicode_Check(unicode)) {
7392 PyErr_BadArgument();
7393 return NULL;
7394 }
7395
Benjamin Petersonbac79492012-01-14 13:34:47 -05007396 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007397 return NULL;
7398 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007399
Victor Stinner3a50e702011-10-18 21:21:00 +02007400 if (code_page < 0) {
7401 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7402 return NULL;
7403 }
7404
Martin v. Löwis3d325192011-11-04 18:23:06 +01007405 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007406 return PyBytes_FromStringAndSize(NULL, 0);
7407
Victor Stinner7581cef2011-11-03 22:32:33 +01007408 offset = 0;
7409 do
7410 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007411#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007412 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007413 chunks. */
7414 if (len > INT_MAX/2) {
7415 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007416 done = 0;
7417 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007418 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007419#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007420 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007421 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007422 done = 1;
7423 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007424
Victor Stinner76a31a62011-11-04 00:05:13 +01007425 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007426 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007427 errors);
7428 if (ret == -2)
7429 ret = encode_code_page_errors(code_page, &outbytes,
7430 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007431 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007432 if (ret < 0) {
7433 Py_XDECREF(outbytes);
7434 return NULL;
7435 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007436
Victor Stinner7581cef2011-11-03 22:32:33 +01007437 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007438 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007439 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007440
Victor Stinner3a50e702011-10-18 21:21:00 +02007441 return outbytes;
7442}
7443
7444PyObject *
7445PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7446 Py_ssize_t size,
7447 const char *errors)
7448{
Victor Stinner7581cef2011-11-03 22:32:33 +01007449 PyObject *unicode, *res;
7450 unicode = PyUnicode_FromUnicode(p, size);
7451 if (unicode == NULL)
7452 return NULL;
7453 res = encode_code_page(CP_ACP, unicode, errors);
7454 Py_DECREF(unicode);
7455 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007456}
7457
7458PyObject *
7459PyUnicode_EncodeCodePage(int code_page,
7460 PyObject *unicode,
7461 const char *errors)
7462{
Victor Stinner7581cef2011-11-03 22:32:33 +01007463 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007464}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007465
Alexander Belopolsky40018472011-02-26 01:02:56 +00007466PyObject *
7467PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007468{
Victor Stinner7581cef2011-11-03 22:32:33 +01007469 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007470}
7471
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007472#undef NEED_RETRY
7473
Victor Stinner99b95382011-07-04 14:23:54 +02007474#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007475
Guido van Rossumd57fd912000-03-10 22:53:23 +00007476/* --- Character Mapping Codec -------------------------------------------- */
7477
Victor Stinnerfb161b12013-04-18 01:44:27 +02007478static int
7479charmap_decode_string(const char *s,
7480 Py_ssize_t size,
7481 PyObject *mapping,
7482 const char *errors,
7483 _PyUnicodeWriter *writer)
7484{
7485 const char *starts = s;
7486 const char *e;
7487 Py_ssize_t startinpos, endinpos;
7488 PyObject *errorHandler = NULL, *exc = NULL;
7489 Py_ssize_t maplen;
7490 enum PyUnicode_Kind mapkind;
7491 void *mapdata;
7492 Py_UCS4 x;
7493 unsigned char ch;
7494
7495 if (PyUnicode_READY(mapping) == -1)
7496 return -1;
7497
7498 maplen = PyUnicode_GET_LENGTH(mapping);
7499 mapdata = PyUnicode_DATA(mapping);
7500 mapkind = PyUnicode_KIND(mapping);
7501
7502 e = s + size;
7503
7504 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7505 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7506 * is disabled in encoding aliases, latin1 is preferred because
7507 * its implementation is faster. */
7508 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7509 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7510 Py_UCS4 maxchar = writer->maxchar;
7511
7512 assert (writer->kind == PyUnicode_1BYTE_KIND);
7513 while (s < e) {
7514 ch = *s;
7515 x = mapdata_ucs1[ch];
7516 if (x > maxchar) {
7517 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7518 goto onError;
7519 maxchar = writer->maxchar;
7520 outdata = (Py_UCS1 *)writer->data;
7521 }
7522 outdata[writer->pos] = x;
7523 writer->pos++;
7524 ++s;
7525 }
7526 return 0;
7527 }
7528
7529 while (s < e) {
7530 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7531 enum PyUnicode_Kind outkind = writer->kind;
7532 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7533 if (outkind == PyUnicode_1BYTE_KIND) {
7534 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7535 Py_UCS4 maxchar = writer->maxchar;
7536 while (s < e) {
7537 ch = *s;
7538 x = mapdata_ucs2[ch];
7539 if (x > maxchar)
7540 goto Error;
7541 outdata[writer->pos] = x;
7542 writer->pos++;
7543 ++s;
7544 }
7545 break;
7546 }
7547 else if (outkind == PyUnicode_2BYTE_KIND) {
7548 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7549 while (s < e) {
7550 ch = *s;
7551 x = mapdata_ucs2[ch];
7552 if (x == 0xFFFE)
7553 goto Error;
7554 outdata[writer->pos] = x;
7555 writer->pos++;
7556 ++s;
7557 }
7558 break;
7559 }
7560 }
7561 ch = *s;
7562
7563 if (ch < maplen)
7564 x = PyUnicode_READ(mapkind, mapdata, ch);
7565 else
7566 x = 0xfffe; /* invalid value */
7567Error:
7568 if (x == 0xfffe)
7569 {
7570 /* undefined mapping */
7571 startinpos = s-starts;
7572 endinpos = startinpos+1;
7573 if (unicode_decode_call_errorhandler_writer(
7574 errors, &errorHandler,
7575 "charmap", "character maps to <undefined>",
7576 &starts, &e, &startinpos, &endinpos, &exc, &s,
7577 writer)) {
7578 goto onError;
7579 }
7580 continue;
7581 }
7582
7583 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7584 goto onError;
7585 ++s;
7586 }
7587 Py_XDECREF(errorHandler);
7588 Py_XDECREF(exc);
7589 return 0;
7590
7591onError:
7592 Py_XDECREF(errorHandler);
7593 Py_XDECREF(exc);
7594 return -1;
7595}
7596
7597static int
7598charmap_decode_mapping(const char *s,
7599 Py_ssize_t size,
7600 PyObject *mapping,
7601 const char *errors,
7602 _PyUnicodeWriter *writer)
7603{
7604 const char *starts = s;
7605 const char *e;
7606 Py_ssize_t startinpos, endinpos;
7607 PyObject *errorHandler = NULL, *exc = NULL;
7608 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007609 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007610
7611 e = s + size;
7612
7613 while (s < e) {
7614 ch = *s;
7615
7616 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7617 key = PyLong_FromLong((long)ch);
7618 if (key == NULL)
7619 goto onError;
7620
7621 item = PyObject_GetItem(mapping, key);
7622 Py_DECREF(key);
7623 if (item == NULL) {
7624 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7625 /* No mapping found means: mapping is undefined. */
7626 PyErr_Clear();
7627 goto Undefined;
7628 } else
7629 goto onError;
7630 }
7631
7632 /* Apply mapping */
7633 if (item == Py_None)
7634 goto Undefined;
7635 if (PyLong_Check(item)) {
7636 long value = PyLong_AS_LONG(item);
7637 if (value == 0xFFFE)
7638 goto Undefined;
7639 if (value < 0 || value > MAX_UNICODE) {
7640 PyErr_Format(PyExc_TypeError,
7641 "character mapping must be in range(0x%lx)",
7642 (unsigned long)MAX_UNICODE + 1);
7643 goto onError;
7644 }
7645
7646 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7647 goto onError;
7648 }
7649 else if (PyUnicode_Check(item)) {
7650 if (PyUnicode_READY(item) == -1)
7651 goto onError;
7652 if (PyUnicode_GET_LENGTH(item) == 1) {
7653 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7654 if (value == 0xFFFE)
7655 goto Undefined;
7656 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7657 goto onError;
7658 }
7659 else {
7660 writer->overallocate = 1;
7661 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7662 goto onError;
7663 }
7664 }
7665 else {
7666 /* wrong return value */
7667 PyErr_SetString(PyExc_TypeError,
7668 "character mapping must return integer, None or str");
7669 goto onError;
7670 }
7671 Py_CLEAR(item);
7672 ++s;
7673 continue;
7674
7675Undefined:
7676 /* undefined mapping */
7677 Py_CLEAR(item);
7678 startinpos = s-starts;
7679 endinpos = startinpos+1;
7680 if (unicode_decode_call_errorhandler_writer(
7681 errors, &errorHandler,
7682 "charmap", "character maps to <undefined>",
7683 &starts, &e, &startinpos, &endinpos, &exc, &s,
7684 writer)) {
7685 goto onError;
7686 }
7687 }
7688 Py_XDECREF(errorHandler);
7689 Py_XDECREF(exc);
7690 return 0;
7691
7692onError:
7693 Py_XDECREF(item);
7694 Py_XDECREF(errorHandler);
7695 Py_XDECREF(exc);
7696 return -1;
7697}
7698
Alexander Belopolsky40018472011-02-26 01:02:56 +00007699PyObject *
7700PyUnicode_DecodeCharmap(const char *s,
7701 Py_ssize_t size,
7702 PyObject *mapping,
7703 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007704{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007705 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007706
Guido van Rossumd57fd912000-03-10 22:53:23 +00007707 /* Default to Latin-1 */
7708 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007709 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007710
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007712 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007713 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007714 writer.min_length = size;
7715 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007716 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007717
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007718 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007719 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7720 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007721 }
7722 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007723 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7724 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007725 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007726 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007727
Benjamin Peterson29060642009-01-31 22:14:21 +00007728 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007729 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007730 return NULL;
7731}
7732
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007733/* Charmap encoding: the lookup table */
7734
Alexander Belopolsky40018472011-02-26 01:02:56 +00007735struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007736 PyObject_HEAD
7737 unsigned char level1[32];
7738 int count2, count3;
7739 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007740};
7741
7742static PyObject*
7743encoding_map_size(PyObject *obj, PyObject* args)
7744{
7745 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007746 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007747 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007748}
7749
7750static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007751 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007752 PyDoc_STR("Return the size (in bytes) of this object") },
7753 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007754};
7755
7756static void
7757encoding_map_dealloc(PyObject* o)
7758{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007759 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007760}
7761
7762static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007763 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007764 "EncodingMap", /*tp_name*/
7765 sizeof(struct encoding_map), /*tp_basicsize*/
7766 0, /*tp_itemsize*/
7767 /* methods */
7768 encoding_map_dealloc, /*tp_dealloc*/
7769 0, /*tp_print*/
7770 0, /*tp_getattr*/
7771 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007772 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007773 0, /*tp_repr*/
7774 0, /*tp_as_number*/
7775 0, /*tp_as_sequence*/
7776 0, /*tp_as_mapping*/
7777 0, /*tp_hash*/
7778 0, /*tp_call*/
7779 0, /*tp_str*/
7780 0, /*tp_getattro*/
7781 0, /*tp_setattro*/
7782 0, /*tp_as_buffer*/
7783 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7784 0, /*tp_doc*/
7785 0, /*tp_traverse*/
7786 0, /*tp_clear*/
7787 0, /*tp_richcompare*/
7788 0, /*tp_weaklistoffset*/
7789 0, /*tp_iter*/
7790 0, /*tp_iternext*/
7791 encoding_map_methods, /*tp_methods*/
7792 0, /*tp_members*/
7793 0, /*tp_getset*/
7794 0, /*tp_base*/
7795 0, /*tp_dict*/
7796 0, /*tp_descr_get*/
7797 0, /*tp_descr_set*/
7798 0, /*tp_dictoffset*/
7799 0, /*tp_init*/
7800 0, /*tp_alloc*/
7801 0, /*tp_new*/
7802 0, /*tp_free*/
7803 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007804};
7805
7806PyObject*
7807PyUnicode_BuildEncodingMap(PyObject* string)
7808{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007809 PyObject *result;
7810 struct encoding_map *mresult;
7811 int i;
7812 int need_dict = 0;
7813 unsigned char level1[32];
7814 unsigned char level2[512];
7815 unsigned char *mlevel1, *mlevel2, *mlevel3;
7816 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007817 int kind;
7818 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007819 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007820 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007821
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007822 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007823 PyErr_BadArgument();
7824 return NULL;
7825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007826 kind = PyUnicode_KIND(string);
7827 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007828 length = PyUnicode_GET_LENGTH(string);
7829 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007830 memset(level1, 0xFF, sizeof level1);
7831 memset(level2, 0xFF, sizeof level2);
7832
7833 /* If there isn't a one-to-one mapping of NULL to \0,
7834 or if there are non-BMP characters, we need to use
7835 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007836 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007837 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007838 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007839 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007840 ch = PyUnicode_READ(kind, data, i);
7841 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007842 need_dict = 1;
7843 break;
7844 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007845 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007846 /* unmapped character */
7847 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007848 l1 = ch >> 11;
7849 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007850 if (level1[l1] == 0xFF)
7851 level1[l1] = count2++;
7852 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007853 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007854 }
7855
7856 if (count2 >= 0xFF || count3 >= 0xFF)
7857 need_dict = 1;
7858
7859 if (need_dict) {
7860 PyObject *result = PyDict_New();
7861 PyObject *key, *value;
7862 if (!result)
7863 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007864 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007865 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007866 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007867 if (!key || !value)
7868 goto failed1;
7869 if (PyDict_SetItem(result, key, value) == -1)
7870 goto failed1;
7871 Py_DECREF(key);
7872 Py_DECREF(value);
7873 }
7874 return result;
7875 failed1:
7876 Py_XDECREF(key);
7877 Py_XDECREF(value);
7878 Py_DECREF(result);
7879 return NULL;
7880 }
7881
7882 /* Create a three-level trie */
7883 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7884 16*count2 + 128*count3 - 1);
7885 if (!result)
7886 return PyErr_NoMemory();
7887 PyObject_Init(result, &EncodingMapType);
7888 mresult = (struct encoding_map*)result;
7889 mresult->count2 = count2;
7890 mresult->count3 = count3;
7891 mlevel1 = mresult->level1;
7892 mlevel2 = mresult->level23;
7893 mlevel3 = mresult->level23 + 16*count2;
7894 memcpy(mlevel1, level1, 32);
7895 memset(mlevel2, 0xFF, 16*count2);
7896 memset(mlevel3, 0, 128*count3);
7897 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007898 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007899 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007900 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7901 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007902 /* unmapped character */
7903 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007904 o1 = ch>>11;
7905 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007906 i2 = 16*mlevel1[o1] + o2;
7907 if (mlevel2[i2] == 0xFF)
7908 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007909 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007910 i3 = 128*mlevel2[i2] + o3;
7911 mlevel3[i3] = i;
7912 }
7913 return result;
7914}
7915
7916static int
Victor Stinner22168992011-11-20 17:09:18 +01007917encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007918{
7919 struct encoding_map *map = (struct encoding_map*)mapping;
7920 int l1 = c>>11;
7921 int l2 = (c>>7) & 0xF;
7922 int l3 = c & 0x7F;
7923 int i;
7924
Victor Stinner22168992011-11-20 17:09:18 +01007925 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007926 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007927 if (c == 0)
7928 return 0;
7929 /* level 1*/
7930 i = map->level1[l1];
7931 if (i == 0xFF) {
7932 return -1;
7933 }
7934 /* level 2*/
7935 i = map->level23[16*i+l2];
7936 if (i == 0xFF) {
7937 return -1;
7938 }
7939 /* level 3 */
7940 i = map->level23[16*map->count2 + 128*i + l3];
7941 if (i == 0) {
7942 return -1;
7943 }
7944 return i;
7945}
7946
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007947/* Lookup the character ch in the mapping. If the character
7948 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007949 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007950static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007951charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007952{
Christian Heimes217cfd12007-12-02 14:31:20 +00007953 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007954 PyObject *x;
7955
7956 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007957 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007958 x = PyObject_GetItem(mapping, w);
7959 Py_DECREF(w);
7960 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7962 /* No mapping found means: mapping is undefined. */
7963 PyErr_Clear();
7964 x = Py_None;
7965 Py_INCREF(x);
7966 return x;
7967 } else
7968 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007969 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007970 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007971 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007972 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007973 long value = PyLong_AS_LONG(x);
7974 if (value < 0 || value > 255) {
7975 PyErr_SetString(PyExc_TypeError,
7976 "character mapping must be in range(256)");
7977 Py_DECREF(x);
7978 return NULL;
7979 }
7980 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007981 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007982 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007983 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007984 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007985 /* wrong return value */
7986 PyErr_Format(PyExc_TypeError,
7987 "character mapping must return integer, bytes or None, not %.400s",
7988 x->ob_type->tp_name);
7989 Py_DECREF(x);
7990 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007991 }
7992}
7993
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007994static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007995charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007996{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007997 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7998 /* exponentially overallocate to minimize reallocations */
7999 if (requiredsize < 2*outsize)
8000 requiredsize = 2*outsize;
8001 if (_PyBytes_Resize(outobj, requiredsize))
8002 return -1;
8003 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008004}
8005
/* Outcome of trying to encode one character, see charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the mapping is undefined; nothing was written */
    enc_EXCEPTION   /* an error occurred during lookup or resizing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008009/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008010 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008011 space is available. Return a new reference to the object that
8012 was put in the output buffer, or Py_None, if the mapping was undefined
8013 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008014 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008015static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008016charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008017 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008018{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008019 PyObject *rep;
8020 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008021 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008022
Christian Heimes90aa7642007-12-19 02:45:37 +00008023 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008024 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008025 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008026 if (res == -1)
8027 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008028 if (outsize<requiredsize)
8029 if (charmapencode_resize(outobj, outpos, requiredsize))
8030 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008031 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 outstart[(*outpos)++] = (char)res;
8033 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008034 }
8035
8036 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008037 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008038 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008039 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008040 Py_DECREF(rep);
8041 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008042 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008043 if (PyLong_Check(rep)) {
8044 Py_ssize_t requiredsize = *outpos+1;
8045 if (outsize<requiredsize)
8046 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8047 Py_DECREF(rep);
8048 return enc_EXCEPTION;
8049 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008050 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008051 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008052 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008053 else {
8054 const char *repchars = PyBytes_AS_STRING(rep);
8055 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8056 Py_ssize_t requiredsize = *outpos+repsize;
8057 if (outsize<requiredsize)
8058 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8059 Py_DECREF(rep);
8060 return enc_EXCEPTION;
8061 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008062 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 memcpy(outstart + *outpos, repchars, repsize);
8064 *outpos += repsize;
8065 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008066 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008067 Py_DECREF(rep);
8068 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008069}
8070
8071/* handle an error in PyUnicode_EncodeCharmap
8072 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008073static int
8074charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008075 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008076 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008077 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008078 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008079{
8080 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008081 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008082 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008083 enum PyUnicode_Kind kind;
8084 void *data;
8085 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008086 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008087 Py_ssize_t collstartpos = *inpos;
8088 Py_ssize_t collendpos = *inpos+1;
8089 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008090 char *encoding = "charmap";
8091 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008092 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008093 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008094 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008095
Benjamin Petersonbac79492012-01-14 13:34:47 -05008096 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008097 return -1;
8098 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008099 /* find all unencodable characters */
8100 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008101 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008102 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008103 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008104 val = encoding_map_lookup(ch, mapping);
8105 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008106 break;
8107 ++collendpos;
8108 continue;
8109 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008110
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008111 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8112 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008113 if (rep==NULL)
8114 return -1;
8115 else if (rep!=Py_None) {
8116 Py_DECREF(rep);
8117 break;
8118 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008119 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008120 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008121 }
8122 /* cache callback name lookup
8123 * (if not done yet, i.e. it's the first error) */
8124 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008125 if ((errors==NULL) || (!strcmp(errors, "strict")))
8126 *known_errorHandler = 1;
8127 else if (!strcmp(errors, "replace"))
8128 *known_errorHandler = 2;
8129 else if (!strcmp(errors, "ignore"))
8130 *known_errorHandler = 3;
8131 else if (!strcmp(errors, "xmlcharrefreplace"))
8132 *known_errorHandler = 4;
8133 else
8134 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008135 }
8136 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008137 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008138 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008139 return -1;
8140 case 2: /* replace */
8141 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008142 x = charmapencode_output('?', mapping, res, respos);
8143 if (x==enc_EXCEPTION) {
8144 return -1;
8145 }
8146 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008147 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008148 return -1;
8149 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008150 }
8151 /* fall through */
8152 case 3: /* ignore */
8153 *inpos = collendpos;
8154 break;
8155 case 4: /* xmlcharrefreplace */
8156 /* generate replacement (temporarily (mis)uses p) */
8157 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008158 char buffer[2+29+1+1];
8159 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008160 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 for (cp = buffer; *cp; ++cp) {
8162 x = charmapencode_output(*cp, mapping, res, respos);
8163 if (x==enc_EXCEPTION)
8164 return -1;
8165 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008166 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008167 return -1;
8168 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008169 }
8170 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008171 *inpos = collendpos;
8172 break;
8173 default:
8174 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008175 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008176 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008177 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008178 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008179 if (PyBytes_Check(repunicode)) {
8180 /* Directly copy bytes result to output. */
8181 Py_ssize_t outsize = PyBytes_Size(*res);
8182 Py_ssize_t requiredsize;
8183 repsize = PyBytes_Size(repunicode);
8184 requiredsize = *respos + repsize;
8185 if (requiredsize > outsize)
8186 /* Make room for all additional bytes. */
8187 if (charmapencode_resize(res, respos, requiredsize)) {
8188 Py_DECREF(repunicode);
8189 return -1;
8190 }
8191 memcpy(PyBytes_AsString(*res) + *respos,
8192 PyBytes_AsString(repunicode), repsize);
8193 *respos += repsize;
8194 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008195 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008196 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008197 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008198 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008199 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008200 Py_DECREF(repunicode);
8201 return -1;
8202 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008203 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008204 data = PyUnicode_DATA(repunicode);
8205 kind = PyUnicode_KIND(repunicode);
8206 for (index = 0; index < repsize; index++) {
8207 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8208 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008209 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008210 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008211 return -1;
8212 }
8213 else if (x==enc_FAILED) {
8214 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008215 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008216 return -1;
8217 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008218 }
8219 *inpos = newpos;
8220 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008221 }
8222 return 0;
8223}
8224
Alexander Belopolsky40018472011-02-26 01:02:56 +00008225PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008226_PyUnicode_EncodeCharmap(PyObject *unicode,
8227 PyObject *mapping,
8228 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008229{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008230 /* output object */
8231 PyObject *res = NULL;
8232 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008233 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008234 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008235 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008236 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008237 PyObject *errorHandler = NULL;
8238 PyObject *exc = NULL;
8239 /* the following variable is used for caching string comparisons
8240 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8241 * 3=ignore, 4=xmlcharrefreplace */
8242 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008243 void *data;
8244 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008245
Benjamin Petersonbac79492012-01-14 13:34:47 -05008246 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008247 return NULL;
8248 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008249 data = PyUnicode_DATA(unicode);
8250 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008251
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252 /* Default to Latin-1 */
8253 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008254 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008255
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008256 /* allocate enough for a simple encoding without
8257 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008258 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008259 if (res == NULL)
8260 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008261 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008262 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008263
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008264 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008265 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008267 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008268 if (x==enc_EXCEPTION) /* error */
8269 goto onError;
8270 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008271 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 &exc,
8273 &known_errorHandler, &errorHandler, errors,
8274 &res, &respos)) {
8275 goto onError;
8276 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008277 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008278 else
8279 /* done with this character => adjust input position */
8280 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008281 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008282
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008283 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008284 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008285 if (_PyBytes_Resize(&res, respos) < 0)
8286 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008287
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008288 Py_XDECREF(exc);
8289 Py_XDECREF(errorHandler);
8290 return res;
8291
Benjamin Peterson29060642009-01-31 22:14:21 +00008292 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008293 Py_XDECREF(res);
8294 Py_XDECREF(exc);
8295 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296 return NULL;
8297}
8298
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008299/* Deprecated */
8300PyObject *
8301PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8302 Py_ssize_t size,
8303 PyObject *mapping,
8304 const char *errors)
8305{
8306 PyObject *result;
8307 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8308 if (unicode == NULL)
8309 return NULL;
8310 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8311 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008312 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008313}
8314
Alexander Belopolsky40018472011-02-26 01:02:56 +00008315PyObject *
8316PyUnicode_AsCharmapString(PyObject *unicode,
8317 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318{
8319 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 PyErr_BadArgument();
8321 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008323 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008324}
8325
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008326/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008327static void
8328make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008330 Py_ssize_t startpos, Py_ssize_t endpos,
8331 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008332{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008333 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008334 *exceptionObject = _PyUnicodeTranslateError_Create(
8335 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336 }
8337 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008338 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8339 goto onError;
8340 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8341 goto onError;
8342 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8343 goto onError;
8344 return;
8345 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008346 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008347 }
8348}
8349
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008350/* error handling callback helper:
8351 build arguments, call the callback and check the arguments,
8352 put the result into newpos and return the replacement string, which
8353 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008354static PyObject *
8355unicode_translate_call_errorhandler(const char *errors,
8356 PyObject **errorHandler,
8357 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008358 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008359 Py_ssize_t startpos, Py_ssize_t endpos,
8360 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008362 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008364 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365 PyObject *restuple;
8366 PyObject *resunicode;
8367
8368 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008372 }
8373
8374 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008375 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008378
8379 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008384 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 Py_DECREF(restuple);
8386 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387 }
8388 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 &resunicode, &i_newpos)) {
8390 Py_DECREF(restuple);
8391 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008392 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008393 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008394 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008395 else
8396 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008397 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008398 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 Py_DECREF(restuple);
8400 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008401 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008402 Py_INCREF(resunicode);
8403 Py_DECREF(restuple);
8404 return resunicode;
8405}
8406
8407/* Lookup the character ch in the mapping and put the result in result,
8408 which must be decrefed by the caller.
8409 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008410static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008411charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412{
Christian Heimes217cfd12007-12-02 14:31:20 +00008413 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414 PyObject *x;
8415
8416 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008417 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 x = PyObject_GetItem(mapping, w);
8419 Py_DECREF(w);
8420 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8422 /* No mapping found means: use 1:1 mapping. */
8423 PyErr_Clear();
8424 *result = NULL;
8425 return 0;
8426 } else
8427 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428 }
8429 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008430 *result = x;
8431 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008433 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008435 if (value < 0 || value > MAX_UNICODE) {
8436 PyErr_Format(PyExc_ValueError,
8437 "character mapping must be in range(0x%x)",
8438 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 Py_DECREF(x);
8440 return -1;
8441 }
8442 *result = x;
8443 return 0;
8444 }
8445 else if (PyUnicode_Check(x)) {
8446 *result = x;
8447 return 0;
8448 }
8449 else {
8450 /* wrong return value */
8451 PyErr_SetString(PyExc_TypeError,
8452 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008453 Py_DECREF(x);
8454 return -1;
8455 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008456}
Victor Stinner1194ea02014-04-04 19:37:40 +02008457
8458/* lookup the character, write the result into the writer.
8459 Return 1 if the result was written into the writer, return 0 if the mapping
8460 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008461static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008462charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8463 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008464{
Victor Stinner1194ea02014-04-04 19:37:40 +02008465 PyObject *item;
8466
8467 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008469
8470 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008471 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008472 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008473 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008474 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008475 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008476 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008477
8478 if (item == Py_None) {
8479 Py_DECREF(item);
8480 return 0;
8481 }
8482
8483 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008484 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8485 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8486 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008487 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8488 Py_DECREF(item);
8489 return -1;
8490 }
8491 Py_DECREF(item);
8492 return 1;
8493 }
8494
8495 if (!PyUnicode_Check(item)) {
8496 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008498 }
8499
8500 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8501 Py_DECREF(item);
8502 return -1;
8503 }
8504
8505 Py_DECREF(item);
8506 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008507}
8508
Victor Stinner89a76ab2014-04-05 11:44:04 +02008509static int
8510unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8511 Py_UCS1 *translate)
8512{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008513 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008514 int ret = 0;
8515
Victor Stinner89a76ab2014-04-05 11:44:04 +02008516 if (charmaptranslate_lookup(ch, mapping, &item)) {
8517 return -1;
8518 }
8519
8520 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008521 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008522 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008523 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008524 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008525 /* not found => default to 1:1 mapping */
8526 translate[ch] = ch;
8527 return 1;
8528 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008529 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008530 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008531 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8532 used it */
8533 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008534 /* invalid character or character outside ASCII:
8535 skip the fast translate */
8536 goto exit;
8537 }
8538 translate[ch] = (Py_UCS1)replace;
8539 }
8540 else if (PyUnicode_Check(item)) {
8541 Py_UCS4 replace;
8542
8543 if (PyUnicode_READY(item) == -1) {
8544 Py_DECREF(item);
8545 return -1;
8546 }
8547 if (PyUnicode_GET_LENGTH(item) != 1)
8548 goto exit;
8549
8550 replace = PyUnicode_READ_CHAR(item, 0);
8551 if (replace > 127)
8552 goto exit;
8553 translate[ch] = (Py_UCS1)replace;
8554 }
8555 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008556 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008557 goto exit;
8558 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008559 ret = 1;
8560
Benjamin Peterson1365de72014-04-07 20:15:41 -04008561 exit:
8562 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008563 return ret;
8564}
8565
8566/* Fast path for ascii => ascii translation. Return 1 if the whole string
8567 was translated into writer, return 0 if the input string was partially
8568 translated into writer, raise an exception and return -1 on error. */
8569static int
8570unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner872b2912014-04-05 14:27:07 +02008571 _PyUnicodeWriter *writer, int ignore)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008572{
Victor Stinner872b2912014-04-05 14:27:07 +02008573 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008574 Py_ssize_t len;
8575 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008576 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008577
8578 if (PyUnicode_READY(input) == -1)
8579 return -1;
8580 if (!PyUnicode_IS_ASCII(input))
8581 return 0;
8582 len = PyUnicode_GET_LENGTH(input);
8583
Victor Stinner872b2912014-04-05 14:27:07 +02008584 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008585
8586 in = PyUnicode_1BYTE_DATA(input);
8587 end = in + len;
8588
8589 assert(PyUnicode_IS_ASCII(writer->buffer));
8590 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8591 out = PyUnicode_1BYTE_DATA(writer->buffer);
8592
Victor Stinner872b2912014-04-05 14:27:07 +02008593 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008594 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008595 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008596 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008597 int translate = unicode_fast_translate_lookup(mapping, ch,
8598 ascii_table);
8599 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008600 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008601 if (translate == 0)
8602 goto exit;
8603 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008604 }
Victor Stinner872b2912014-04-05 14:27:07 +02008605 if (ch2 == 0xfe) {
8606 if (ignore)
8607 continue;
8608 goto exit;
8609 }
8610 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008611 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008612 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008613 }
Victor Stinner872b2912014-04-05 14:27:07 +02008614 res = 1;
8615
8616exit:
8617 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8618 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008619}
8620
Alexander Belopolsky40018472011-02-26 01:02:56 +00008621PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008622_PyUnicode_TranslateCharmap(PyObject *input,
8623 PyObject *mapping,
8624 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008627 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008628 Py_ssize_t size, i;
8629 int kind;
8630 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008631 _PyUnicodeWriter writer;
8632 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633 char *reason = "character maps to <undefined>";
8634 PyObject *errorHandler = NULL;
8635 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008636 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008637 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008638
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 PyErr_BadArgument();
8641 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008642 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008643
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008644 if (PyUnicode_READY(input) == -1)
8645 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008646 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008647 kind = PyUnicode_KIND(input);
8648 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008649
8650 if (size == 0) {
8651 Py_INCREF(input);
8652 return input;
8653 }
8654
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008655 /* allocate enough for a simple 1:1 translation without
8656 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008657 _PyUnicodeWriter_Init(&writer);
8658 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008659 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660
Victor Stinner872b2912014-04-05 14:27:07 +02008661 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8662
8663 res = unicode_fast_translate(input, mapping, &writer, ignore);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008664 if (res < 0) {
8665 _PyUnicodeWriter_Dealloc(&writer);
8666 return NULL;
8667 }
8668 if (res == 1)
8669 return _PyUnicodeWriter_Finish(&writer);
8670
Victor Stinner89a76ab2014-04-05 11:44:04 +02008671 i = writer.pos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008674 int translate;
8675 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8676 Py_ssize_t newpos;
8677 /* startpos for collecting untranslatable chars */
8678 Py_ssize_t collstart;
8679 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008680 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681
Victor Stinner1194ea02014-04-04 19:37:40 +02008682 ch = PyUnicode_READ(kind, data, i);
8683 translate = charmaptranslate_output(ch, mapping, &writer);
8684 if (translate < 0)
8685 goto onError;
8686
8687 if (translate != 0) {
8688 /* it worked => adjust input pointer */
8689 ++i;
8690 continue;
8691 }
8692
8693 /* untranslatable character */
8694 collstart = i;
8695 collend = i+1;
8696
8697 /* find all untranslatable characters */
8698 while (collend < size) {
8699 PyObject *x;
8700 ch = PyUnicode_READ(kind, data, collend);
8701 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008702 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008703 Py_XDECREF(x);
8704 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008705 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008706 ++collend;
8707 }
8708
8709 if (ignore) {
8710 i = collend;
8711 }
8712 else {
8713 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8714 reason, input, &exc,
8715 collstart, collend, &newpos);
8716 if (repunicode == NULL)
8717 goto onError;
8718 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008720 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008721 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008722 Py_DECREF(repunicode);
8723 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008724 }
8725 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008726 Py_XDECREF(exc);
8727 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008728 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008729
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008731 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 Py_XDECREF(exc);
8733 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734 return NULL;
8735}
8736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737/* Deprecated. Use PyUnicode_Translate instead. */
8738PyObject *
8739PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8740 Py_ssize_t size,
8741 PyObject *mapping,
8742 const char *errors)
8743{
Christian Heimes5f520f42012-09-11 14:03:25 +02008744 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008745 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8746 if (!unicode)
8747 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008748 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8749 Py_DECREF(unicode);
8750 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751}
8752
Alexander Belopolsky40018472011-02-26 01:02:56 +00008753PyObject *
8754PyUnicode_Translate(PyObject *str,
8755 PyObject *mapping,
8756 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008757{
8758 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008759
Guido van Rossumd57fd912000-03-10 22:53:23 +00008760 str = PyUnicode_FromObject(str);
8761 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008762 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008763 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764 Py_DECREF(str);
8765 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008766}
Tim Petersced69f82003-09-16 20:30:58 +00008767
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008768static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008769fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008770{
8771 /* No need to call PyUnicode_READY(self) because this function is only
8772 called as a callback from fixup() which does it already. */
8773 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8774 const int kind = PyUnicode_KIND(self);
8775 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008776 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008777 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 Py_ssize_t i;
8779
8780 for (i = 0; i < len; ++i) {
8781 ch = PyUnicode_READ(kind, data, i);
8782 fixed = 0;
8783 if (ch > 127) {
8784 if (Py_UNICODE_ISSPACE(ch))
8785 fixed = ' ';
8786 else {
8787 const int decimal = Py_UNICODE_TODECIMAL(ch);
8788 if (decimal >= 0)
8789 fixed = '0' + decimal;
8790 }
8791 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008792 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008793 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008794 PyUnicode_WRITE(kind, data, i, fixed);
8795 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008796 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008797 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008798 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008799 }
8800
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008801 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802}
8803
8804PyObject *
8805_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8806{
8807 if (!PyUnicode_Check(unicode)) {
8808 PyErr_BadInternalCall();
8809 return NULL;
8810 }
8811 if (PyUnicode_READY(unicode) == -1)
8812 return NULL;
8813 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8814 /* If the string is already ASCII, just return the same string */
8815 Py_INCREF(unicode);
8816 return unicode;
8817 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008818 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008819}
8820
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008821PyObject *
8822PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8823 Py_ssize_t length)
8824{
Victor Stinnerf0124502011-11-21 23:12:56 +01008825 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008826 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008827 Py_UCS4 maxchar;
8828 enum PyUnicode_Kind kind;
8829 void *data;
8830
Victor Stinner99d7ad02012-02-22 13:37:39 +01008831 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008832 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008833 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008834 if (ch > 127) {
8835 int decimal = Py_UNICODE_TODECIMAL(ch);
8836 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008837 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008838 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008839 }
8840 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008841
8842 /* Copy to a new string */
8843 decimal = PyUnicode_New(length, maxchar);
8844 if (decimal == NULL)
8845 return decimal;
8846 kind = PyUnicode_KIND(decimal);
8847 data = PyUnicode_DATA(decimal);
8848 /* Iterate over code points */
8849 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008850 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01008851 if (ch > 127) {
8852 int decimal = Py_UNICODE_TODECIMAL(ch);
8853 if (decimal >= 0)
8854 ch = '0' + decimal;
8855 }
8856 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008857 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008858 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008859}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008860/* --- Decimal Encoder ---------------------------------------------------- */
8861
Alexander Belopolsky40018472011-02-26 01:02:56 +00008862int
8863PyUnicode_EncodeDecimal(Py_UNICODE *s,
8864 Py_ssize_t length,
8865 char *output,
8866 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008867{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008868 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008869 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008870 enum PyUnicode_Kind kind;
8871 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008872
8873 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008874 PyErr_BadArgument();
8875 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008876 }
8877
Victor Stinner42bf7752011-11-21 22:52:58 +01008878 unicode = PyUnicode_FromUnicode(s, length);
8879 if (unicode == NULL)
8880 return -1;
8881
Benjamin Petersonbac79492012-01-14 13:34:47 -05008882 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008883 Py_DECREF(unicode);
8884 return -1;
8885 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008886 kind = PyUnicode_KIND(unicode);
8887 data = PyUnicode_DATA(unicode);
8888
Victor Stinnerb84d7232011-11-22 01:50:07 +01008889 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008890 PyObject *exc;
8891 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008892 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008893 Py_ssize_t startpos;
8894
8895 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008896
Benjamin Peterson29060642009-01-31 22:14:21 +00008897 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008898 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008899 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008901 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008902 decimal = Py_UNICODE_TODECIMAL(ch);
8903 if (decimal >= 0) {
8904 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008905 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008906 continue;
8907 }
8908 if (0 < ch && ch < 256) {
8909 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008910 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008911 continue;
8912 }
Victor Stinner6345be92011-11-25 20:09:01 +01008913
Victor Stinner42bf7752011-11-21 22:52:58 +01008914 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008915 exc = NULL;
8916 raise_encode_exception(&exc, "decimal", unicode,
8917 startpos, startpos+1,
8918 "invalid decimal Unicode string");
8919 Py_XDECREF(exc);
8920 Py_DECREF(unicode);
8921 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008922 }
8923 /* 0-terminate the output string */
8924 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008925 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008926 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008927}
8928
Guido van Rossumd57fd912000-03-10 22:53:23 +00008929/* --- Helpers ------------------------------------------------------------ */
8930
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped to 0 if still negative.  `start` is not clipped
   against `len`.  `start` and `end` must be modifiable lvalues.

   The body is wrapped in do { } while (0) so the macro expands to a single
   statement and can safely be used as the body of an if/else; arguments
   are parenthesized so expressions can be passed.  Invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
8945
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008946static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008947any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008948 Py_ssize_t start,
8949 Py_ssize_t end)
8950{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008951 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 void *buf1, *buf2;
8953 Py_ssize_t len1, len2, result;
8954
8955 kind1 = PyUnicode_KIND(s1);
8956 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008957 if (kind1 < kind2)
8958 return -1;
8959
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960 len1 = PyUnicode_GET_LENGTH(s1);
8961 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008962 ADJUST_INDICES(start, end, len1);
8963 if (end - start < len2)
8964 return -1;
8965
8966 buf1 = PyUnicode_DATA(s1);
8967 buf2 = PyUnicode_DATA(s2);
8968 if (len2 == 1) {
8969 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
8970 result = findchar((const char *)buf1 + kind1*start,
8971 kind1, end - start, ch, direction);
8972 if (result == -1)
8973 return -1;
8974 else
8975 return start + result;
8976 }
8977
8978 if (kind2 != kind1) {
8979 buf2 = _PyUnicode_AsKind(s2, kind1);
8980 if (!buf2)
8981 return -2;
8982 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008983
Victor Stinner794d5672011-10-10 03:21:36 +02008984 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008985 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02008986 case PyUnicode_1BYTE_KIND:
8987 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8988 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8989 else
8990 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8991 break;
8992 case PyUnicode_2BYTE_KIND:
8993 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8994 break;
8995 case PyUnicode_4BYTE_KIND:
8996 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8997 break;
8998 default:
8999 assert(0); result = -2;
9000 }
9001 }
9002 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009003 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009004 case PyUnicode_1BYTE_KIND:
9005 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9006 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9007 else
9008 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9009 break;
9010 case PyUnicode_2BYTE_KIND:
9011 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9012 break;
9013 case PyUnicode_4BYTE_KIND:
9014 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9015 break;
9016 default:
9017 assert(0); result = -2;
9018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009019 }
9020
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009021 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009022 PyMem_Free(buf2);
9023
9024 return result;
9025}
9026
9027Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009028_PyUnicode_InsertThousandsGrouping(
9029 PyObject *unicode, Py_ssize_t index,
9030 Py_ssize_t n_buffer,
9031 void *digits, Py_ssize_t n_digits,
9032 Py_ssize_t min_width,
9033 const char *grouping, PyObject *thousands_sep,
9034 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035{
Victor Stinner41a863c2012-02-24 00:37:51 +01009036 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009037 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009038 Py_ssize_t thousands_sep_len;
9039 Py_ssize_t len;
9040
9041 if (unicode != NULL) {
9042 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009043 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009044 }
9045 else {
9046 kind = PyUnicode_1BYTE_KIND;
9047 data = NULL;
9048 }
9049 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9050 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9051 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9052 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009053 if (thousands_sep_kind < kind) {
9054 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9055 if (!thousands_sep_data)
9056 return -1;
9057 }
9058 else {
9059 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9060 if (!data)
9061 return -1;
9062 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009063 }
9064
Benjamin Petersonead6b532011-12-20 17:23:42 -06009065 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009067 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009068 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009069 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009070 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009071 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009072 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009073 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009074 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009075 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009076 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009077 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009079 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009080 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009081 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009082 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009083 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009084 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009085 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009086 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009087 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009088 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009089 break;
9090 default:
9091 assert(0);
9092 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009094 if (unicode != NULL && thousands_sep_kind != kind) {
9095 if (thousands_sep_kind < kind)
9096 PyMem_Free(thousands_sep_data);
9097 else
9098 PyMem_Free(data);
9099 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009100 if (unicode == NULL) {
9101 *maxchar = 127;
9102 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009103 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009104 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009105 }
9106 }
9107 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108}
9109
9110
Alexander Belopolsky40018472011-02-26 01:02:56 +00009111Py_ssize_t
9112PyUnicode_Count(PyObject *str,
9113 PyObject *substr,
9114 Py_ssize_t start,
9115 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009116{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009117 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009118 PyObject* str_obj;
9119 PyObject* sub_obj;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009120 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009121 void *buf1 = NULL, *buf2 = NULL;
9122 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009123
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009124 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009125 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009127 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009128 if (!sub_obj) {
9129 Py_DECREF(str_obj);
9130 return -1;
9131 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009132 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009133 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009134 Py_DECREF(str_obj);
9135 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009136 }
Tim Petersced69f82003-09-16 20:30:58 +00009137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009138 kind1 = PyUnicode_KIND(str_obj);
9139 kind2 = PyUnicode_KIND(sub_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009140 if (kind1 < kind2) {
9141 Py_DECREF(sub_obj);
9142 Py_DECREF(str_obj);
9143 return 0;
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009144 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009145
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009146 len1 = PyUnicode_GET_LENGTH(str_obj);
9147 len2 = PyUnicode_GET_LENGTH(sub_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009148 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009149 if (end - start < len2) {
9150 Py_DECREF(sub_obj);
9151 Py_DECREF(str_obj);
9152 return 0;
9153 }
9154
9155 buf1 = PyUnicode_DATA(str_obj);
9156 buf2 = PyUnicode_DATA(sub_obj);
9157 if (kind2 != kind1) {
9158 buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9159 if (!buf2)
9160 goto onError;
9161 }
9162
9163 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009164 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009165 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9166 result = asciilib_count(
9167 ((Py_UCS1*)buf1) + start, end - start,
9168 buf2, len2, PY_SSIZE_T_MAX
9169 );
9170 else
9171 result = ucs1lib_count(
9172 ((Py_UCS1*)buf1) + start, end - start,
9173 buf2, len2, PY_SSIZE_T_MAX
9174 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009175 break;
9176 case PyUnicode_2BYTE_KIND:
9177 result = ucs2lib_count(
9178 ((Py_UCS2*)buf1) + start, end - start,
9179 buf2, len2, PY_SSIZE_T_MAX
9180 );
9181 break;
9182 case PyUnicode_4BYTE_KIND:
9183 result = ucs4lib_count(
9184 ((Py_UCS4*)buf1) + start, end - start,
9185 buf2, len2, PY_SSIZE_T_MAX
9186 );
9187 break;
9188 default:
9189 assert(0); result = 0;
9190 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009191
9192 Py_DECREF(sub_obj);
9193 Py_DECREF(str_obj);
9194
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009195 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009196 PyMem_Free(buf2);
9197
Guido van Rossumd57fd912000-03-10 22:53:23 +00009198 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009199 onError:
9200 Py_DECREF(sub_obj);
9201 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009202 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009203 PyMem_Free(buf2);
9204 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009205}
9206
Alexander Belopolsky40018472011-02-26 01:02:56 +00009207Py_ssize_t
9208PyUnicode_Find(PyObject *str,
9209 PyObject *sub,
9210 Py_ssize_t start,
9211 Py_ssize_t end,
9212 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009213{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009214 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009215
Guido van Rossumd57fd912000-03-10 22:53:23 +00009216 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009217 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009218 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009219 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009220 if (!sub) {
9221 Py_DECREF(str);
9222 return -2;
9223 }
9224 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9225 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009226 Py_DECREF(str);
9227 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009228 }
Tim Petersced69f82003-09-16 20:30:58 +00009229
Victor Stinner794d5672011-10-10 03:21:36 +02009230 result = any_find_slice(direction,
9231 str, sub, start, end
9232 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009233
Guido van Rossumd57fd912000-03-10 22:53:23 +00009234 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009235 Py_DECREF(sub);
9236
Guido van Rossumd57fd912000-03-10 22:53:23 +00009237 return result;
9238}
9239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009240Py_ssize_t
9241PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9242 Py_ssize_t start, Py_ssize_t end,
9243 int direction)
9244{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009245 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009246 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009247 if (PyUnicode_READY(str) == -1)
9248 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009249 if (start < 0 || end < 0) {
9250 PyErr_SetString(PyExc_IndexError, "string index out of range");
9251 return -2;
9252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009253 if (end > PyUnicode_GET_LENGTH(str))
9254 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009255 if (start >= end)
9256 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009258 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9259 kind, end-start, ch, direction);
9260 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009262 else
9263 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264}
9265
Alexander Belopolsky40018472011-02-26 01:02:56 +00009266static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009267tailmatch(PyObject *self,
9268 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009269 Py_ssize_t start,
9270 Py_ssize_t end,
9271 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009272{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273 int kind_self;
9274 int kind_sub;
9275 void *data_self;
9276 void *data_sub;
9277 Py_ssize_t offset;
9278 Py_ssize_t i;
9279 Py_ssize_t end_sub;
9280
9281 if (PyUnicode_READY(self) == -1 ||
9282 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009283 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9286 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009287 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009288 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009289
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009290 if (PyUnicode_GET_LENGTH(substring) == 0)
9291 return 1;
9292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 kind_self = PyUnicode_KIND(self);
9294 data_self = PyUnicode_DATA(self);
9295 kind_sub = PyUnicode_KIND(substring);
9296 data_sub = PyUnicode_DATA(substring);
9297 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9298
9299 if (direction > 0)
9300 offset = end;
9301 else
9302 offset = start;
9303
9304 if (PyUnicode_READ(kind_self, data_self, offset) ==
9305 PyUnicode_READ(kind_sub, data_sub, 0) &&
9306 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9307 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9308 /* If both are of the same kind, memcmp is sufficient */
9309 if (kind_self == kind_sub) {
9310 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009311 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009312 data_sub,
9313 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009314 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009315 }
9316 /* otherwise we have to compare each character by first accesing it */
9317 else {
9318 /* We do not need to compare 0 and len(substring)-1 because
9319 the if statement above ensured already that they are equal
9320 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009321 for (i = 1; i < end_sub; ++i) {
9322 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9323 PyUnicode_READ(kind_sub, data_sub, i))
9324 return 0;
9325 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009326 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009328 }
9329
9330 return 0;
9331}
9332
Alexander Belopolsky40018472011-02-26 01:02:56 +00009333Py_ssize_t
9334PyUnicode_Tailmatch(PyObject *str,
9335 PyObject *substr,
9336 Py_ssize_t start,
9337 Py_ssize_t end,
9338 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009339{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009340 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009341
Guido van Rossumd57fd912000-03-10 22:53:23 +00009342 str = PyUnicode_FromObject(str);
9343 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009344 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009345 substr = PyUnicode_FromObject(substr);
9346 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009347 Py_DECREF(str);
9348 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349 }
Tim Petersced69f82003-09-16 20:30:58 +00009350
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009351 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009352 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353 Py_DECREF(str);
9354 Py_DECREF(substr);
9355 return result;
9356}
9357
Guido van Rossumd57fd912000-03-10 22:53:23 +00009358/* Apply fixfct filter to the Unicode object self and return a
9359 reference to the modified object */
9360
Alexander Belopolsky40018472011-02-26 01:02:56 +00009361static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009362fixup(PyObject *self,
9363 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 PyObject *u;
9366 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009367 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009369 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009371 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009372 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009373
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009374 /* fix functions return the new maximum character in a string,
9375 if the kind of the resulting unicode object does not change,
9376 everything is fine. Otherwise we need to change the string kind
9377 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009378 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009379
9380 if (maxchar_new == 0) {
9381 /* no changes */;
9382 if (PyUnicode_CheckExact(self)) {
9383 Py_DECREF(u);
9384 Py_INCREF(self);
9385 return self;
9386 }
9387 else
9388 return u;
9389 }
9390
Victor Stinnere6abb482012-05-02 01:15:40 +02009391 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009392
Victor Stinnereaab6042011-12-11 22:22:39 +01009393 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009395
9396 /* In case the maximum character changed, we need to
9397 convert the string to the new category. */
9398 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9399 if (v == NULL) {
9400 Py_DECREF(u);
9401 return NULL;
9402 }
9403 if (maxchar_new > maxchar_old) {
9404 /* If the maxchar increased so that the kind changed, not all
9405 characters are representable anymore and we need to fix the
9406 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009407 _PyUnicode_FastCopyCharacters(v, 0,
9408 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009409 maxchar_old = fixfct(v);
9410 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411 }
9412 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009413 _PyUnicode_FastCopyCharacters(v, 0,
9414 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009416 Py_DECREF(u);
9417 assert(_PyUnicode_CheckConsistency(v, 1));
9418 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009419}
9420
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009421static PyObject *
9422ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009423{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009424 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9425 char *resdata, *data = PyUnicode_DATA(self);
9426 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009427
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009428 res = PyUnicode_New(len, 127);
9429 if (res == NULL)
9430 return NULL;
9431 resdata = PyUnicode_DATA(res);
9432 if (lower)
9433 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009435 _Py_bytes_upper(resdata, data, len);
9436 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009437}
9438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009440handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009442 Py_ssize_t j;
9443 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009444 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009445 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009446
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009447 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9448
9449 where ! is a negation and \p{xxx} is a character with property xxx.
9450 */
9451 for (j = i - 1; j >= 0; j--) {
9452 c = PyUnicode_READ(kind, data, j);
9453 if (!_PyUnicode_IsCaseIgnorable(c))
9454 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009455 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009456 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9457 if (final_sigma) {
9458 for (j = i + 1; j < length; j++) {
9459 c = PyUnicode_READ(kind, data, j);
9460 if (!_PyUnicode_IsCaseIgnorable(c))
9461 break;
9462 }
9463 final_sigma = j == length || !_PyUnicode_IsCased(c);
9464 }
9465 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009466}
9467
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009468static int
9469lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9470 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009471{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009472 /* Obscure special case. */
9473 if (c == 0x3A3) {
9474 mapped[0] = handle_capital_sigma(kind, data, length, i);
9475 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009476 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009477 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009478}
9479
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009480static Py_ssize_t
9481do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009483 Py_ssize_t i, k = 0;
9484 int n_res, j;
9485 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009486
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009487 c = PyUnicode_READ(kind, data, 0);
9488 n_res = _PyUnicode_ToUpperFull(c, mapped);
9489 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009490 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009491 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009493 for (i = 1; i < length; i++) {
9494 c = PyUnicode_READ(kind, data, i);
9495 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9496 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009497 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009498 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009499 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009500 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009501 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009502}
9503
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009504static Py_ssize_t
9505do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9506 Py_ssize_t i, k = 0;
9507
9508 for (i = 0; i < length; i++) {
9509 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9510 int n_res, j;
9511 if (Py_UNICODE_ISUPPER(c)) {
9512 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9513 }
9514 else if (Py_UNICODE_ISLOWER(c)) {
9515 n_res = _PyUnicode_ToUpperFull(c, mapped);
9516 }
9517 else {
9518 n_res = 1;
9519 mapped[0] = c;
9520 }
9521 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009522 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009523 res[k++] = mapped[j];
9524 }
9525 }
9526 return k;
9527}
9528
9529static Py_ssize_t
9530do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9531 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009532{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009533 Py_ssize_t i, k = 0;
9534
9535 for (i = 0; i < length; i++) {
9536 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9537 int n_res, j;
9538 if (lower)
9539 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9540 else
9541 n_res = _PyUnicode_ToUpperFull(c, mapped);
9542 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009543 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009544 res[k++] = mapped[j];
9545 }
9546 }
9547 return k;
9548}
9549
9550static Py_ssize_t
9551do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9552{
9553 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9554}
9555
9556static Py_ssize_t
9557do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9558{
9559 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9560}
9561
Benjamin Petersone51757f2012-01-12 21:10:29 -05009562static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009563do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9564{
9565 Py_ssize_t i, k = 0;
9566
9567 for (i = 0; i < length; i++) {
9568 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9569 Py_UCS4 mapped[3];
9570 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9571 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009572 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009573 res[k++] = mapped[j];
9574 }
9575 }
9576 return k;
9577}
9578
9579static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009580do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9581{
9582 Py_ssize_t i, k = 0;
9583 int previous_is_cased;
9584
9585 previous_is_cased = 0;
9586 for (i = 0; i < length; i++) {
9587 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9588 Py_UCS4 mapped[3];
9589 int n_res, j;
9590
9591 if (previous_is_cased)
9592 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9593 else
9594 n_res = _PyUnicode_ToTitleFull(c, mapped);
9595
9596 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009597 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009598 res[k++] = mapped[j];
9599 }
9600
9601 previous_is_cased = _PyUnicode_IsCased(c);
9602 }
9603 return k;
9604}
9605
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009606static PyObject *
9607case_operation(PyObject *self,
9608 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9609{
9610 PyObject *res = NULL;
9611 Py_ssize_t length, newlength = 0;
9612 int kind, outkind;
9613 void *data, *outdata;
9614 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9615
Benjamin Petersoneea48462012-01-16 14:28:50 -05009616 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009617
9618 kind = PyUnicode_KIND(self);
9619 data = PyUnicode_DATA(self);
9620 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009621 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009622 PyErr_SetString(PyExc_OverflowError, "string is too long");
9623 return NULL;
9624 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009625 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009626 if (tmp == NULL)
9627 return PyErr_NoMemory();
9628 newlength = perform(kind, data, length, tmp, &maxchar);
9629 res = PyUnicode_New(newlength, maxchar);
9630 if (res == NULL)
9631 goto leave;
9632 tmpend = tmp + newlength;
9633 outdata = PyUnicode_DATA(res);
9634 outkind = PyUnicode_KIND(res);
9635 switch (outkind) {
9636 case PyUnicode_1BYTE_KIND:
9637 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9638 break;
9639 case PyUnicode_2BYTE_KIND:
9640 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9641 break;
9642 case PyUnicode_4BYTE_KIND:
9643 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9644 break;
9645 default:
9646 assert(0);
9647 break;
9648 }
9649 leave:
9650 PyMem_FREE(tmp);
9651 return res;
9652}
9653
Tim Peters8ce9f162004-08-27 01:49:32 +00009654PyObject *
9655PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009657 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009658 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009660 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009661 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9662 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009663 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009664 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009665 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009666 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009667 int use_memcpy;
9668 unsigned char *res_data = NULL, *sep_data = NULL;
9669 PyObject *last_obj;
9670 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009671
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009672 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009673 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009674 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009675 }
9676
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009677 /* NOTE: the following code can't call back into Python code,
9678 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009679 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009680
Tim Peters05eba1f2004-08-27 21:32:02 +00009681 seqlen = PySequence_Fast_GET_SIZE(fseq);
9682 /* If empty sequence, return u"". */
9683 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009684 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009685 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009686 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009687
Tim Peters05eba1f2004-08-27 21:32:02 +00009688 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009689 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009690 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009691 if (seqlen == 1) {
9692 if (PyUnicode_CheckExact(items[0])) {
9693 res = items[0];
9694 Py_INCREF(res);
9695 Py_DECREF(fseq);
9696 return res;
9697 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009698 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009699 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009700 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009701 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009702 /* Set up sep and seplen */
9703 if (separator == NULL) {
9704 /* fall back to a blank space separator */
9705 sep = PyUnicode_FromOrdinal(' ');
9706 if (!sep)
9707 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009708 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009709 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009710 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009711 else {
9712 if (!PyUnicode_Check(separator)) {
9713 PyErr_Format(PyExc_TypeError,
9714 "separator: expected str instance,"
9715 " %.80s found",
9716 Py_TYPE(separator)->tp_name);
9717 goto onError;
9718 }
9719 if (PyUnicode_READY(separator))
9720 goto onError;
9721 sep = separator;
9722 seplen = PyUnicode_GET_LENGTH(separator);
9723 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9724 /* inc refcount to keep this code path symmetric with the
9725 above case of a blank separator */
9726 Py_INCREF(sep);
9727 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009728 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009729 }
9730
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009731 /* There are at least two things to join, or else we have a subclass
9732 * of str in the sequence.
9733 * Do a pre-pass to figure out the total amount of space we'll
9734 * need (sz), and see whether all argument are strings.
9735 */
9736 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009737#ifdef Py_DEBUG
9738 use_memcpy = 0;
9739#else
9740 use_memcpy = 1;
9741#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009742 for (i = 0; i < seqlen; i++) {
9743 const Py_ssize_t old_sz = sz;
9744 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009745 if (!PyUnicode_Check(item)) {
9746 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009747 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009748 " %.80s found",
9749 i, Py_TYPE(item)->tp_name);
9750 goto onError;
9751 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009752 if (PyUnicode_READY(item) == -1)
9753 goto onError;
9754 sz += PyUnicode_GET_LENGTH(item);
9755 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009756 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009757 if (i != 0)
9758 sz += seplen;
9759 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9760 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009761 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009762 goto onError;
9763 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009764 if (use_memcpy && last_obj != NULL) {
9765 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9766 use_memcpy = 0;
9767 }
9768 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009769 }
Tim Petersced69f82003-09-16 20:30:58 +00009770
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009771 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009772 if (res == NULL)
9773 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009774
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009775 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009776#ifdef Py_DEBUG
9777 use_memcpy = 0;
9778#else
9779 if (use_memcpy) {
9780 res_data = PyUnicode_1BYTE_DATA(res);
9781 kind = PyUnicode_KIND(res);
9782 if (seplen != 0)
9783 sep_data = PyUnicode_1BYTE_DATA(sep);
9784 }
9785#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009786 if (use_memcpy) {
9787 for (i = 0; i < seqlen; ++i) {
9788 Py_ssize_t itemlen;
9789 item = items[i];
9790
9791 /* Copy item, and maybe the separator. */
9792 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009793 Py_MEMCPY(res_data,
9794 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009795 kind * seplen);
9796 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009797 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009798
9799 itemlen = PyUnicode_GET_LENGTH(item);
9800 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009801 Py_MEMCPY(res_data,
9802 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009803 kind * itemlen);
9804 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009805 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009806 }
9807 assert(res_data == PyUnicode_1BYTE_DATA(res)
9808 + kind * PyUnicode_GET_LENGTH(res));
9809 }
9810 else {
9811 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9812 Py_ssize_t itemlen;
9813 item = items[i];
9814
9815 /* Copy item, and maybe the separator. */
9816 if (i && seplen != 0) {
9817 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9818 res_offset += seplen;
9819 }
9820
9821 itemlen = PyUnicode_GET_LENGTH(item);
9822 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009823 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009824 res_offset += itemlen;
9825 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009826 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009827 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009828 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009829
Tim Peters05eba1f2004-08-27 21:32:02 +00009830 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009831 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009832 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009834
Benjamin Peterson29060642009-01-31 22:14:21 +00009835 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009836 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009838 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009839 return NULL;
9840}
9841
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code units of the buffer `data`,
   beginning at code unit index `start`.  `kind` selects the width of a code
   unit (1, 2 or 4 bytes); the wchar kind is rejected by an assertion and any
   other unknown kind trips assert(0).

   For the 1-byte kind the value is truncated to unsigned char and written
   with memset(); the wider kinds are written one unit at a time.

   Every parameter is parenthesized at its point of use so that expression
   arguments expand correctly.  `kind`, `value` and `length` may be evaluated
   more than once, so the arguments must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9865
Victor Stinnerd3f08822012-05-29 12:57:52 +02009866void
9867_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9868 Py_UCS4 fill_char)
9869{
9870 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9871 const void *data = PyUnicode_DATA(unicode);
9872 assert(PyUnicode_IS_READY(unicode));
9873 assert(unicode_modifiable(unicode));
9874 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9875 assert(start >= 0);
9876 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9877 FILL(kind, data, fill_char, start, length);
9878}
9879
Victor Stinner3fe55312012-01-04 00:33:50 +01009880Py_ssize_t
9881PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9882 Py_UCS4 fill_char)
9883{
9884 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009885
9886 if (!PyUnicode_Check(unicode)) {
9887 PyErr_BadInternalCall();
9888 return -1;
9889 }
9890 if (PyUnicode_READY(unicode) == -1)
9891 return -1;
9892 if (unicode_check_modifiable(unicode))
9893 return -1;
9894
Victor Stinnerd3f08822012-05-29 12:57:52 +02009895 if (start < 0) {
9896 PyErr_SetString(PyExc_IndexError, "string index out of range");
9897 return -1;
9898 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009899 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9900 PyErr_SetString(PyExc_ValueError,
9901 "fill character is bigger than "
9902 "the string maximum character");
9903 return -1;
9904 }
9905
9906 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9907 length = Py_MIN(maxlen, length);
9908 if (length <= 0)
9909 return 0;
9910
Victor Stinnerd3f08822012-05-29 12:57:52 +02009911 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009912 return length;
9913}
9914
Victor Stinner9310abb2011-10-05 00:59:23 +02009915static PyObject *
9916pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009917 Py_ssize_t left,
9918 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009919 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009920{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009921 PyObject *u;
9922 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009923 int kind;
9924 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925
9926 if (left < 0)
9927 left = 0;
9928 if (right < 0)
9929 right = 0;
9930
Victor Stinnerc4b49542011-12-11 22:44:26 +01009931 if (left == 0 && right == 0)
9932 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009933
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9935 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009936 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9937 return NULL;
9938 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009940 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009941 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009942 if (!u)
9943 return NULL;
9944
9945 kind = PyUnicode_KIND(u);
9946 data = PyUnicode_DATA(u);
9947 if (left)
9948 FILL(kind, data, fill, 0, left);
9949 if (right)
9950 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009951 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009952 assert(_PyUnicode_CheckConsistency(u, 1));
9953 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009954}
9955
Alexander Belopolsky40018472011-02-26 01:02:56 +00009956PyObject *
9957PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009959 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009960
9961 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009962 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009963 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009964 if (PyUnicode_READY(string) == -1) {
9965 Py_DECREF(string);
9966 return NULL;
9967 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009968
Benjamin Petersonead6b532011-12-20 17:23:42 -06009969 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009971 if (PyUnicode_IS_ASCII(string))
9972 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009973 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009974 PyUnicode_GET_LENGTH(string), keepends);
9975 else
9976 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009977 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009978 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009979 break;
9980 case PyUnicode_2BYTE_KIND:
9981 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009982 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 PyUnicode_GET_LENGTH(string), keepends);
9984 break;
9985 case PyUnicode_4BYTE_KIND:
9986 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009987 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009988 PyUnicode_GET_LENGTH(string), keepends);
9989 break;
9990 default:
9991 assert(0);
9992 list = 0;
9993 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009994 Py_DECREF(string);
9995 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009996}
9997
Alexander Belopolsky40018472011-02-26 01:02:56 +00009998static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009999split(PyObject *self,
10000 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010001 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010002{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010003 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 void *buf1, *buf2;
10005 Py_ssize_t len1, len2;
10006 PyObject* out;
10007
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010009 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010011 if (PyUnicode_READY(self) == -1)
10012 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010014 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010015 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010017 if (PyUnicode_IS_ASCII(self))
10018 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010019 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010020 PyUnicode_GET_LENGTH(self), maxcount
10021 );
10022 else
10023 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010024 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010025 PyUnicode_GET_LENGTH(self), maxcount
10026 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010027 case PyUnicode_2BYTE_KIND:
10028 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010029 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010030 PyUnicode_GET_LENGTH(self), maxcount
10031 );
10032 case PyUnicode_4BYTE_KIND:
10033 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010034 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010035 PyUnicode_GET_LENGTH(self), maxcount
10036 );
10037 default:
10038 assert(0);
10039 return NULL;
10040 }
10041
10042 if (PyUnicode_READY(substring) == -1)
10043 return NULL;
10044
10045 kind1 = PyUnicode_KIND(self);
10046 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010047 len1 = PyUnicode_GET_LENGTH(self);
10048 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010049 if (kind1 < kind2 || len1 < len2) {
10050 out = PyList_New(1);
10051 if (out == NULL)
10052 return NULL;
10053 Py_INCREF(self);
10054 PyList_SET_ITEM(out, 0, self);
10055 return out;
10056 }
10057 buf1 = PyUnicode_DATA(self);
10058 buf2 = PyUnicode_DATA(substring);
10059 if (kind2 != kind1) {
10060 buf2 = _PyUnicode_AsKind(substring, kind1);
10061 if (!buf2)
10062 return NULL;
10063 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010064
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010065 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010066 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010067 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10068 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010069 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010070 else
10071 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010072 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010073 break;
10074 case PyUnicode_2BYTE_KIND:
10075 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010076 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 break;
10078 case PyUnicode_4BYTE_KIND:
10079 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010080 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 break;
10082 default:
10083 out = NULL;
10084 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010085 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010086 PyMem_Free(buf2);
10087 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010088}
10089
Alexander Belopolsky40018472011-02-26 01:02:56 +000010090static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010091rsplit(PyObject *self,
10092 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010093 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010094{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010095 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010096 void *buf1, *buf2;
10097 Py_ssize_t len1, len2;
10098 PyObject* out;
10099
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010100 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010101 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 if (PyUnicode_READY(self) == -1)
10104 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010107 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010109 if (PyUnicode_IS_ASCII(self))
10110 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010111 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010112 PyUnicode_GET_LENGTH(self), maxcount
10113 );
10114 else
10115 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010116 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010117 PyUnicode_GET_LENGTH(self), maxcount
10118 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010119 case PyUnicode_2BYTE_KIND:
10120 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010121 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010122 PyUnicode_GET_LENGTH(self), maxcount
10123 );
10124 case PyUnicode_4BYTE_KIND:
10125 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010126 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010127 PyUnicode_GET_LENGTH(self), maxcount
10128 );
10129 default:
10130 assert(0);
10131 return NULL;
10132 }
10133
10134 if (PyUnicode_READY(substring) == -1)
10135 return NULL;
10136
10137 kind1 = PyUnicode_KIND(self);
10138 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010139 len1 = PyUnicode_GET_LENGTH(self);
10140 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010141 if (kind1 < kind2 || len1 < len2) {
10142 out = PyList_New(1);
10143 if (out == NULL)
10144 return NULL;
10145 Py_INCREF(self);
10146 PyList_SET_ITEM(out, 0, self);
10147 return out;
10148 }
10149 buf1 = PyUnicode_DATA(self);
10150 buf2 = PyUnicode_DATA(substring);
10151 if (kind2 != kind1) {
10152 buf2 = _PyUnicode_AsKind(substring, kind1);
10153 if (!buf2)
10154 return NULL;
10155 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010156
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010157 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010158 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010159 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10160 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010161 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010162 else
10163 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010164 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010165 break;
10166 case PyUnicode_2BYTE_KIND:
10167 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010168 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 break;
10170 case PyUnicode_4BYTE_KIND:
10171 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010172 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 break;
10174 default:
10175 out = NULL;
10176 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010177 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010178 PyMem_Free(buf2);
10179 return out;
10180}
10181
10182static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010183anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10184 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010186 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010188 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10189 return asciilib_find(buf1, len1, buf2, len2, offset);
10190 else
10191 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 case PyUnicode_2BYTE_KIND:
10193 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10194 case PyUnicode_4BYTE_KIND:
10195 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10196 }
10197 assert(0);
10198 return -1;
10199}
10200
10201static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010202anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10203 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010204{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010205 switch (kind) {
10206 case PyUnicode_1BYTE_KIND:
10207 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10208 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10209 else
10210 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10211 case PyUnicode_2BYTE_KIND:
10212 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10213 case PyUnicode_4BYTE_KIND:
10214 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10215 }
10216 assert(0);
10217 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010218}
10219
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010220static void
10221replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10222 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10223{
10224 int kind = PyUnicode_KIND(u);
10225 void *data = PyUnicode_DATA(u);
10226 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10227 if (kind == PyUnicode_1BYTE_KIND) {
10228 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10229 (Py_UCS1 *)data + len,
10230 u1, u2, maxcount);
10231 }
10232 else if (kind == PyUnicode_2BYTE_KIND) {
10233 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10234 (Py_UCS2 *)data + len,
10235 u1, u2, maxcount);
10236 }
10237 else {
10238 assert(kind == PyUnicode_4BYTE_KIND);
10239 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10240 (Py_UCS4 *)data + len,
10241 u1, u2, maxcount);
10242 }
10243}
10244
Alexander Belopolsky40018472011-02-26 01:02:56 +000010245static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246replace(PyObject *self, PyObject *str1,
10247 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 PyObject *u;
10250 char *sbuf = PyUnicode_DATA(self);
10251 char *buf1 = PyUnicode_DATA(str1);
10252 char *buf2 = PyUnicode_DATA(str2);
10253 int srelease = 0, release1 = 0, release2 = 0;
10254 int skind = PyUnicode_KIND(self);
10255 int kind1 = PyUnicode_KIND(str1);
10256 int kind2 = PyUnicode_KIND(str2);
10257 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10258 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10259 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010260 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010261 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010262
10263 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010264 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010266 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010267
Victor Stinner59de0ee2011-10-07 10:01:28 +020010268 if (str1 == str2)
10269 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270
Victor Stinner49a0a212011-10-12 23:46:10 +020010271 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010272 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10273 if (maxchar < maxchar_str1)
10274 /* substring too wide to be present */
10275 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010276 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10277 /* Replacing str1 with str2 may cause a maxchar reduction in the
10278 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010279 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010280 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010281
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010282 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010283 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010285 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010287 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010288 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010289 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010290
Victor Stinner69ed0f42013-04-09 21:48:24 +020010291 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010292 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010293 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010294 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010295 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010297 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010299
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010300 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10301 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010302 }
10303 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010304 int rkind = skind;
10305 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010306 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 if (kind1 < rkind) {
10309 /* widen substring */
10310 buf1 = _PyUnicode_AsKind(str1, rkind);
10311 if (!buf1) goto error;
10312 release1 = 1;
10313 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010314 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010315 if (i < 0)
10316 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 if (rkind > kind2) {
10318 /* widen replacement */
10319 buf2 = _PyUnicode_AsKind(str2, rkind);
10320 if (!buf2) goto error;
10321 release2 = 1;
10322 }
10323 else if (rkind < kind2) {
10324 /* widen self and buf1 */
10325 rkind = kind2;
10326 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010327 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 sbuf = _PyUnicode_AsKind(self, rkind);
10329 if (!sbuf) goto error;
10330 srelease = 1;
10331 buf1 = _PyUnicode_AsKind(str1, rkind);
10332 if (!buf1) goto error;
10333 release1 = 1;
10334 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010335 u = PyUnicode_New(slen, maxchar);
10336 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010338 assert(PyUnicode_KIND(u) == rkind);
10339 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010340
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010341 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010342 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010343 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010345 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010346 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010347
10348 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010349 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010350 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010351 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010352 if (i == -1)
10353 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010354 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010356 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010357 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010358 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010360 }
10361 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010362 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010363 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 int rkind = skind;
10365 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010367 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010368 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369 buf1 = _PyUnicode_AsKind(str1, rkind);
10370 if (!buf1) goto error;
10371 release1 = 1;
10372 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010373 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010374 if (n == 0)
10375 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010376 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010377 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 buf2 = _PyUnicode_AsKind(str2, rkind);
10379 if (!buf2) goto error;
10380 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010383 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 rkind = kind2;
10385 sbuf = _PyUnicode_AsKind(self, rkind);
10386 if (!sbuf) goto error;
10387 srelease = 1;
10388 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010389 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 buf1 = _PyUnicode_AsKind(str1, rkind);
10391 if (!buf1) goto error;
10392 release1 = 1;
10393 }
10394 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10395 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010396 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 PyErr_SetString(PyExc_OverflowError,
10398 "replace string is too long");
10399 goto error;
10400 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010401 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010402 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010403 _Py_INCREF_UNICODE_EMPTY();
10404 if (!unicode_empty)
10405 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010406 u = unicode_empty;
10407 goto done;
10408 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010409 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 PyErr_SetString(PyExc_OverflowError,
10411 "replace string is too long");
10412 goto error;
10413 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010414 u = PyUnicode_New(new_size, maxchar);
10415 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010417 assert(PyUnicode_KIND(u) == rkind);
10418 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 ires = i = 0;
10420 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010421 while (n-- > 0) {
10422 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010423 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010424 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010425 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010426 if (j == -1)
10427 break;
10428 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010429 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010430 memcpy(res + rkind * ires,
10431 sbuf + rkind * i,
10432 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010434 }
10435 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010437 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010439 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010441 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010445 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010446 memcpy(res + rkind * ires,
10447 sbuf + rkind * i,
10448 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010449 }
10450 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010451 /* interleave */
10452 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010453 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010455 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010457 if (--n <= 0)
10458 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010459 memcpy(res + rkind * ires,
10460 sbuf + rkind * i,
10461 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 ires++;
10463 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010464 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010465 memcpy(res + rkind * ires,
10466 sbuf + rkind * i,
10467 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010468 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010469 }
10470
10471 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010472 unicode_adjust_maxchar(&u);
10473 if (u == NULL)
10474 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010475 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010476
10477 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 if (srelease)
10479 PyMem_FREE(sbuf);
10480 if (release1)
10481 PyMem_FREE(buf1);
10482 if (release2)
10483 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010484 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010486
Benjamin Peterson29060642009-01-31 22:14:21 +000010487 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010488 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 if (srelease)
10490 PyMem_FREE(sbuf);
10491 if (release1)
10492 PyMem_FREE(buf1);
10493 if (release2)
10494 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010495 return unicode_result_unchanged(self);
10496
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010497 error:
10498 if (srelease && sbuf)
10499 PyMem_FREE(sbuf);
10500 if (release1 && buf1)
10501 PyMem_FREE(buf1);
10502 if (release2 && buf2)
10503 PyMem_FREE(buf2);
10504 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505}
10506
10507/* --- Unicode Object Methods --------------------------------------------- */
10508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010509PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010510 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511\n\
10512Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010513characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010514
10515static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010516unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010517{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010518 if (PyUnicode_READY(self) == -1)
10519 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010520 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521}
10522
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010523PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010524 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525\n\
10526Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010527have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010528
10529static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010530unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010532 if (PyUnicode_READY(self) == -1)
10533 return NULL;
10534 if (PyUnicode_GET_LENGTH(self) == 0)
10535 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010536 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537}
10538
Benjamin Petersond5890c82012-01-14 13:23:30 -050010539PyDoc_STRVAR(casefold__doc__,
10540 "S.casefold() -> str\n\
10541\n\
10542Return a version of S suitable for caseless comparisons.");
10543
10544static PyObject *
10545unicode_casefold(PyObject *self)
10546{
10547 if (PyUnicode_READY(self) == -1)
10548 return NULL;
10549 if (PyUnicode_IS_ASCII(self))
10550 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010551 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010552}
10553
10554
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010555/* Argument converter. Coerces to a single unicode character */
10556
10557static int
10558convert_uc(PyObject *obj, void *addr)
10559{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010560 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010561 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010562
Benjamin Peterson14339b62009-01-31 16:36:08 +000010563 uniobj = PyUnicode_FromObject(obj);
10564 if (uniobj == NULL) {
10565 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010566 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010567 return 0;
10568 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010570 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010571 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010572 Py_DECREF(uniobj);
10573 return 0;
10574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010576 Py_DECREF(uniobj);
10577 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010578}
10579
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010580PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010581 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010582\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010583Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010584done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010585
10586static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010587unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010589 Py_ssize_t marg, left;
10590 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 Py_UCS4 fillchar = ' ';
10592
Victor Stinnere9a29352011-10-01 02:14:59 +020010593 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595
Benjamin Petersonbac79492012-01-14 13:34:47 -050010596 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597 return NULL;
10598
Victor Stinnerc4b49542011-12-11 22:44:26 +010010599 if (PyUnicode_GET_LENGTH(self) >= width)
10600 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601
Victor Stinnerc4b49542011-12-11 22:44:26 +010010602 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603 left = marg / 2 + (marg & width & 1);
10604
Victor Stinner9310abb2011-10-05 00:59:23 +020010605 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010606}
10607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608/* This function assumes that str1 and str2 are readied by the caller. */
10609
Marc-André Lemburge5034372000-08-08 08:04:29 +000010610static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010611unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010612{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010613#define COMPARE(TYPE1, TYPE2) \
10614 do { \
10615 TYPE1* p1 = (TYPE1 *)data1; \
10616 TYPE2* p2 = (TYPE2 *)data2; \
10617 TYPE1* end = p1 + len; \
10618 Py_UCS4 c1, c2; \
10619 for (; p1 != end; p1++, p2++) { \
10620 c1 = *p1; \
10621 c2 = *p2; \
10622 if (c1 != c2) \
10623 return (c1 < c2) ? -1 : 1; \
10624 } \
10625 } \
10626 while (0)
10627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 int kind1, kind2;
10629 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010630 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 kind1 = PyUnicode_KIND(str1);
10633 kind2 = PyUnicode_KIND(str2);
10634 data1 = PyUnicode_DATA(str1);
10635 data2 = PyUnicode_DATA(str2);
10636 len1 = PyUnicode_GET_LENGTH(str1);
10637 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010638 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010639
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010640 switch(kind1) {
10641 case PyUnicode_1BYTE_KIND:
10642 {
10643 switch(kind2) {
10644 case PyUnicode_1BYTE_KIND:
10645 {
10646 int cmp = memcmp(data1, data2, len);
10647 /* normalize result of memcmp() into the range [-1; 1] */
10648 if (cmp < 0)
10649 return -1;
10650 if (cmp > 0)
10651 return 1;
10652 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010653 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010654 case PyUnicode_2BYTE_KIND:
10655 COMPARE(Py_UCS1, Py_UCS2);
10656 break;
10657 case PyUnicode_4BYTE_KIND:
10658 COMPARE(Py_UCS1, Py_UCS4);
10659 break;
10660 default:
10661 assert(0);
10662 }
10663 break;
10664 }
10665 case PyUnicode_2BYTE_KIND:
10666 {
10667 switch(kind2) {
10668 case PyUnicode_1BYTE_KIND:
10669 COMPARE(Py_UCS2, Py_UCS1);
10670 break;
10671 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010672 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010673 COMPARE(Py_UCS2, Py_UCS2);
10674 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010675 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010676 case PyUnicode_4BYTE_KIND:
10677 COMPARE(Py_UCS2, Py_UCS4);
10678 break;
10679 default:
10680 assert(0);
10681 }
10682 break;
10683 }
10684 case PyUnicode_4BYTE_KIND:
10685 {
10686 switch(kind2) {
10687 case PyUnicode_1BYTE_KIND:
10688 COMPARE(Py_UCS4, Py_UCS1);
10689 break;
10690 case PyUnicode_2BYTE_KIND:
10691 COMPARE(Py_UCS4, Py_UCS2);
10692 break;
10693 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010694 {
10695#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10696 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10697 /* normalize result of wmemcmp() into the range [-1; 1] */
10698 if (cmp < 0)
10699 return -1;
10700 if (cmp > 0)
10701 return 1;
10702#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010703 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010704#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010705 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010706 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010707 default:
10708 assert(0);
10709 }
10710 break;
10711 }
10712 default:
10713 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010714 }
10715
Victor Stinner770e19e2012-10-04 22:59:45 +020010716 if (len1 == len2)
10717 return 0;
10718 if (len1 < len2)
10719 return -1;
10720 else
10721 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010722
10723#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010724}
10725
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010726Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010727unicode_compare_eq(PyObject *str1, PyObject *str2)
10728{
10729 int kind;
10730 void *data1, *data2;
10731 Py_ssize_t len;
10732 int cmp;
10733
Victor Stinnere5567ad2012-10-23 02:48:49 +020010734 len = PyUnicode_GET_LENGTH(str1);
10735 if (PyUnicode_GET_LENGTH(str2) != len)
10736 return 0;
10737 kind = PyUnicode_KIND(str1);
10738 if (PyUnicode_KIND(str2) != kind)
10739 return 0;
10740 data1 = PyUnicode_DATA(str1);
10741 data2 = PyUnicode_DATA(str2);
10742
10743 cmp = memcmp(data1, data2, len * kind);
10744 return (cmp == 0);
10745}
10746
10747
Alexander Belopolsky40018472011-02-26 01:02:56 +000010748int
10749PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010750{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010751 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10752 if (PyUnicode_READY(left) == -1 ||
10753 PyUnicode_READY(right) == -1)
10754 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010755
10756 /* a string is equal to itself */
10757 if (left == right)
10758 return 0;
10759
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010760 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010762 PyErr_Format(PyExc_TypeError,
10763 "Can't compare %.100s and %.100s",
10764 left->ob_type->tp_name,
10765 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766 return -1;
10767}
10768
Martin v. Löwis5b222132007-06-10 09:51:05 +000010769int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010770_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10771{
10772 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10773 if (right_str == NULL)
10774 return -1;
10775 return PyUnicode_Compare(left, right_str);
10776}
10777
10778int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010779PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10780{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 Py_ssize_t i;
10782 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 Py_UCS4 chr;
10784
Victor Stinner910337b2011-10-03 03:20:16 +020010785 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010786 if (PyUnicode_READY(uni) == -1)
10787 return -1;
10788 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010789 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010790 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010791 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010792 size_t len, len2 = strlen(str);
10793 int cmp;
10794
10795 len = Py_MIN(len1, len2);
10796 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010797 if (cmp != 0) {
10798 if (cmp < 0)
10799 return -1;
10800 else
10801 return 1;
10802 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010803 if (len1 > len2)
10804 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010805 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010806 return -1; /* str is longer */
10807 return 0;
10808 }
10809 else {
10810 void *data = PyUnicode_DATA(uni);
10811 /* Compare Unicode string and source character set string */
10812 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010813 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010814 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10815 /* This check keeps Python strings that end in '\0' from comparing equal
10816 to C strings identical up to that point. */
10817 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10818 return 1; /* uni is longer */
10819 if (str[i])
10820 return -1; /* str is longer */
10821 return 0;
10822 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010823}
10824
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010825
Benjamin Peterson29060642009-01-31 22:14:21 +000010826#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010827 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010828
Alexander Belopolsky40018472011-02-26 01:02:56 +000010829PyObject *
10830PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010831{
10832 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010833 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010834
Victor Stinnere5567ad2012-10-23 02:48:49 +020010835 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10836 Py_RETURN_NOTIMPLEMENTED;
10837
10838 if (PyUnicode_READY(left) == -1 ||
10839 PyUnicode_READY(right) == -1)
10840 return NULL;
10841
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010842 if (left == right) {
10843 switch (op) {
10844 case Py_EQ:
10845 case Py_LE:
10846 case Py_GE:
10847 /* a string is equal to itself */
10848 v = Py_True;
10849 break;
10850 case Py_NE:
10851 case Py_LT:
10852 case Py_GT:
10853 v = Py_False;
10854 break;
10855 default:
10856 PyErr_BadArgument();
10857 return NULL;
10858 }
10859 }
10860 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010861 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010862 result ^= (op == Py_NE);
10863 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010864 }
10865 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010866 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010867
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010868 /* Convert the return value to a Boolean */
10869 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010870 case Py_LE:
10871 v = TEST_COND(result <= 0);
10872 break;
10873 case Py_GE:
10874 v = TEST_COND(result >= 0);
10875 break;
10876 case Py_LT:
10877 v = TEST_COND(result == -1);
10878 break;
10879 case Py_GT:
10880 v = TEST_COND(result == 1);
10881 break;
10882 default:
10883 PyErr_BadArgument();
10884 return NULL;
10885 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010886 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010887 Py_INCREF(v);
10888 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010889}
10890
Alexander Belopolsky40018472011-02-26 01:02:56 +000010891int
10892PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010893{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010894 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010895 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010896 void *buf1, *buf2;
10897 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010898 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010899
10900 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010901 sub = PyUnicode_FromObject(element);
10902 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010903 PyErr_Format(PyExc_TypeError,
10904 "'in <string>' requires string as left operand, not %s",
10905 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010906 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010907 }
10908
Thomas Wouters477c8d52006-05-27 19:21:47 +000010909 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010910 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010911 Py_DECREF(sub);
10912 return -1;
10913 }
10914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010915 kind1 = PyUnicode_KIND(str);
10916 kind2 = PyUnicode_KIND(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010917 if (kind1 < kind2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010918 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010919 Py_DECREF(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010920 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010921 }
10922 len1 = PyUnicode_GET_LENGTH(str);
10923 len2 = PyUnicode_GET_LENGTH(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010924 if (len1 < len2) {
10925 Py_DECREF(sub);
10926 Py_DECREF(str);
10927 return 0;
10928 }
10929 buf1 = PyUnicode_DATA(str);
10930 buf2 = PyUnicode_DATA(sub);
10931 if (len2 == 1) {
10932 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
10933 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
10934 Py_DECREF(sub);
10935 Py_DECREF(str);
10936 return result;
10937 }
10938 if (kind2 != kind1) {
10939 buf2 = _PyUnicode_AsKind(sub, kind1);
10940 if (!buf2) {
10941 Py_DECREF(sub);
10942 Py_DECREF(str);
10943 return -1;
10944 }
10945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010946
Victor Stinner77282cb2013-04-14 19:22:47 +020010947 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010948 case PyUnicode_1BYTE_KIND:
10949 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10950 break;
10951 case PyUnicode_2BYTE_KIND:
10952 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10953 break;
10954 case PyUnicode_4BYTE_KIND:
10955 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10956 break;
10957 default:
10958 result = -1;
10959 assert(0);
10960 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010961
10962 Py_DECREF(str);
10963 Py_DECREF(sub);
10964
Victor Stinner77282cb2013-04-14 19:22:47 +020010965 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 PyMem_Free(buf2);
10967
Guido van Rossum403d68b2000-03-13 15:55:09 +000010968 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010969}
10970
Guido van Rossumd57fd912000-03-10 22:53:23 +000010971/* Concat to string or Unicode object giving a new Unicode object. */
10972
Alexander Belopolsky40018472011-02-26 01:02:56 +000010973PyObject *
10974PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010977 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010978 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979
10980 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010982 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010983 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010984 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010985 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010986 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010987
10988 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010989 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010990 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010992 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010993 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010994 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996 }
10997
Victor Stinner488fa492011-12-12 00:01:39 +010010998 u_len = PyUnicode_GET_LENGTH(u);
10999 v_len = PyUnicode_GET_LENGTH(v);
11000 if (u_len > PY_SSIZE_T_MAX - v_len) {
11001 PyErr_SetString(PyExc_OverflowError,
11002 "strings are too large to concat");
11003 goto onError;
11004 }
11005 new_len = u_len + v_len;
11006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011008 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011009 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011010
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011012 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011014 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011015 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11016 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017 Py_DECREF(u);
11018 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011019 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021
Benjamin Peterson29060642009-01-31 22:14:21 +000011022 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011023 Py_XDECREF(u);
11024 Py_XDECREF(v);
11025 return NULL;
11026}
11027
Walter Dörwald1ab83302007-05-18 17:15:44 +000011028void
Victor Stinner23e56682011-10-03 03:54:37 +020011029PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011030{
Victor Stinner23e56682011-10-03 03:54:37 +020011031 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011032 Py_UCS4 maxchar, maxchar2;
11033 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011034
11035 if (p_left == NULL) {
11036 if (!PyErr_Occurred())
11037 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011038 return;
11039 }
Victor Stinner23e56682011-10-03 03:54:37 +020011040 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011041 if (right == NULL || left == NULL
11042 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011043 if (!PyErr_Occurred())
11044 PyErr_BadInternalCall();
11045 goto error;
11046 }
11047
Benjamin Petersonbac79492012-01-14 13:34:47 -050011048 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011049 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011050 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011051 goto error;
11052
Victor Stinner488fa492011-12-12 00:01:39 +010011053 /* Shortcuts */
11054 if (left == unicode_empty) {
11055 Py_DECREF(left);
11056 Py_INCREF(right);
11057 *p_left = right;
11058 return;
11059 }
11060 if (right == unicode_empty)
11061 return;
11062
11063 left_len = PyUnicode_GET_LENGTH(left);
11064 right_len = PyUnicode_GET_LENGTH(right);
11065 if (left_len > PY_SSIZE_T_MAX - right_len) {
11066 PyErr_SetString(PyExc_OverflowError,
11067 "strings are too large to concat");
11068 goto error;
11069 }
11070 new_len = left_len + right_len;
11071
11072 if (unicode_modifiable(left)
11073 && PyUnicode_CheckExact(right)
11074 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011075 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11076 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011077 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011078 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011079 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11080 {
11081 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011082 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011083 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011084
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011085 /* copy 'right' into the newly allocated area of 'left' */
11086 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011087 }
Victor Stinner488fa492011-12-12 00:01:39 +010011088 else {
11089 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11090 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011091 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011092
Victor Stinner488fa492011-12-12 00:01:39 +010011093 /* Concat the two Unicode strings */
11094 res = PyUnicode_New(new_len, maxchar);
11095 if (res == NULL)
11096 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011097 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11098 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011099 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011100 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011101 }
11102 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011103 return;
11104
11105error:
Victor Stinner488fa492011-12-12 00:01:39 +010011106 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011107}
11108
11109void
11110PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11111{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011112 PyUnicode_Append(pleft, right);
11113 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011114}
11115
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011116PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011117 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011119Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011120string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011121interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122
11123static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011124unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011125{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011126 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011127 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011128 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011130 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011131 void *buf1, *buf2;
11132 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133
Jesus Ceaac451502011-04-20 17:09:23 +020011134 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11135 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011136 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011137
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011138 kind1 = PyUnicode_KIND(self);
11139 kind2 = PyUnicode_KIND(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011140 if (kind1 < kind2) {
Christian Heimesd47802e2013-06-29 21:33:36 +020011141 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011142 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011143 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011144 len1 = PyUnicode_GET_LENGTH(self);
11145 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011146 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011147 if (end - start < len2) {
11148 Py_DECREF(substring);
11149 return PyLong_FromLong(0);
11150 }
11151 buf1 = PyUnicode_DATA(self);
11152 buf2 = PyUnicode_DATA(substring);
11153 if (kind2 != kind1) {
11154 buf2 = _PyUnicode_AsKind(substring, kind1);
11155 if (!buf2) {
11156 Py_DECREF(substring);
11157 return NULL;
11158 }
11159 }
11160 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011161 case PyUnicode_1BYTE_KIND:
11162 iresult = ucs1lib_count(
11163 ((Py_UCS1*)buf1) + start, end - start,
11164 buf2, len2, PY_SSIZE_T_MAX
11165 );
11166 break;
11167 case PyUnicode_2BYTE_KIND:
11168 iresult = ucs2lib_count(
11169 ((Py_UCS2*)buf1) + start, end - start,
11170 buf2, len2, PY_SSIZE_T_MAX
11171 );
11172 break;
11173 case PyUnicode_4BYTE_KIND:
11174 iresult = ucs4lib_count(
11175 ((Py_UCS4*)buf1) + start, end - start,
11176 buf2, len2, PY_SSIZE_T_MAX
11177 );
11178 break;
11179 default:
11180 assert(0); iresult = 0;
11181 }
11182
11183 result = PyLong_FromSsize_t(iresult);
11184
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011185 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011186 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011187
11188 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011189
Guido van Rossumd57fd912000-03-10 22:53:23 +000011190 return result;
11191}
11192
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011193PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011194 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011195\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011196Encode S using the codec registered for encoding. Default encoding\n\
11197is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011198handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011199a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11200'xmlcharrefreplace' as well as any other name registered with\n\
11201codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011202
11203static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011204unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011206 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207 char *encoding = NULL;
11208 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011209
Benjamin Peterson308d6372009-09-18 21:42:35 +000011210 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11211 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011213 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011214}
11215
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011216PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011217 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218\n\
11219Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011220If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221
11222static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011223unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011225 Py_ssize_t i, j, line_pos, src_len, incr;
11226 Py_UCS4 ch;
11227 PyObject *u;
11228 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011229 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011230 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011231 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011232 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011233
Ezio Melotti745d54d2013-11-16 19:10:57 +020011234 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11235 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237
Antoine Pitrou22425222011-10-04 19:10:51 +020011238 if (PyUnicode_READY(self) == -1)
11239 return NULL;
11240
Thomas Wouters7e474022000-07-16 12:04:32 +000011241 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011242 src_len = PyUnicode_GET_LENGTH(self);
11243 i = j = line_pos = 0;
11244 kind = PyUnicode_KIND(self);
11245 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011246 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011247 for (; i < src_len; i++) {
11248 ch = PyUnicode_READ(kind, src_data, i);
11249 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011250 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011251 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011252 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011253 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011254 goto overflow;
11255 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011256 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011257 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011259 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011260 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011261 goto overflow;
11262 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011264 if (ch == '\n' || ch == '\r')
11265 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011266 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011267 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011268 if (!found)
11269 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011270
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011272 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273 if (!u)
11274 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011275 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276
Antoine Pitroue71d5742011-10-04 15:55:09 +020011277 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011278
Antoine Pitroue71d5742011-10-04 15:55:09 +020011279 for (; i < src_len; i++) {
11280 ch = PyUnicode_READ(kind, src_data, i);
11281 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011282 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011283 incr = tabsize - (line_pos % tabsize);
11284 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011285 FILL(kind, dest_data, ' ', j, incr);
11286 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011287 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011288 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011289 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011290 line_pos++;
11291 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011292 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011293 if (ch == '\n' || ch == '\r')
11294 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011296 }
11297 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011298 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011299
Antoine Pitroue71d5742011-10-04 15:55:09 +020011300 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011301 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11302 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303}
11304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011305PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011306 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307\n\
11308Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011309such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011310arguments start and end are interpreted as in slice notation.\n\
11311\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011312Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313
11314static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011315unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011316{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011317 /* initialize variables to prevent gcc warning */
11318 PyObject *substring = NULL;
11319 Py_ssize_t start = 0;
11320 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011321 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011322
Jesus Ceaac451502011-04-20 17:09:23 +020011323 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11324 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011325 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326
Christian Heimesd47802e2013-06-29 21:33:36 +020011327 if (PyUnicode_READY(self) == -1) {
11328 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011330 }
11331 if (PyUnicode_READY(substring) == -1) {
11332 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011334 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335
Victor Stinner7931d9a2011-11-04 00:22:48 +010011336 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011337
11338 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011340 if (result == -2)
11341 return NULL;
11342
Christian Heimes217cfd12007-12-02 14:31:20 +000011343 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011344}
11345
11346static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011347unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011349 void *data;
11350 enum PyUnicode_Kind kind;
11351 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011352
11353 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11354 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011355 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011356 }
11357 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11358 PyErr_SetString(PyExc_IndexError, "string index out of range");
11359 return NULL;
11360 }
11361 kind = PyUnicode_KIND(self);
11362 data = PyUnicode_DATA(self);
11363 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011364 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011365}
11366
Guido van Rossumc2504932007-09-18 19:42:40 +000011367/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011368 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011369static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011370unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011371{
Guido van Rossumc2504932007-09-18 19:42:40 +000011372 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011373 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011374
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011375#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011376 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011377#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 if (_PyUnicode_HASH(self) != -1)
11379 return _PyUnicode_HASH(self);
11380 if (PyUnicode_READY(self) == -1)
11381 return -1;
11382 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011383 /*
11384 We make the hash of the empty string be 0, rather than using
11385 (prefix ^ suffix), since this slightly obfuscates the hash secret
11386 */
11387 if (len == 0) {
11388 _PyUnicode_HASH(self) = 0;
11389 return 0;
11390 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011391 x = _Py_HashBytes(PyUnicode_DATA(self),
11392 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011394 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011395}
11396
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011397PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011398 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011400Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401
11402static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011403unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011404{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011405 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011406 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011407 PyObject *substring = NULL;
11408 Py_ssize_t start = 0;
11409 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011410
Jesus Ceaac451502011-04-20 17:09:23 +020011411 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11412 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414
Christian Heimesd47a0452013-06-29 21:21:37 +020011415 if (PyUnicode_READY(self) == -1) {
11416 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011417 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011418 }
11419 if (PyUnicode_READY(substring) == -1) {
11420 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011421 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011422 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011423
Victor Stinner7931d9a2011-11-04 00:22:48 +010011424 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011425
11426 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011427
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011428 if (result == -2)
11429 return NULL;
11430
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431 if (result < 0) {
11432 PyErr_SetString(PyExc_ValueError, "substring not found");
11433 return NULL;
11434 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011435
Christian Heimes217cfd12007-12-02 14:31:20 +000011436 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437}
11438
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011439PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011440 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011442Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011443at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011444
11445static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011446unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 Py_ssize_t i, length;
11449 int kind;
11450 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451 int cased;
11452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 if (PyUnicode_READY(self) == -1)
11454 return NULL;
11455 length = PyUnicode_GET_LENGTH(self);
11456 kind = PyUnicode_KIND(self);
11457 data = PyUnicode_DATA(self);
11458
Guido van Rossumd57fd912000-03-10 22:53:23 +000011459 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 if (length == 1)
11461 return PyBool_FromLong(
11462 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011464 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011465 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011466 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011467
Guido van Rossumd57fd912000-03-10 22:53:23 +000011468 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 for (i = 0; i < length; i++) {
11470 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011471
Benjamin Peterson29060642009-01-31 22:14:21 +000011472 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11473 return PyBool_FromLong(0);
11474 else if (!cased && Py_UNICODE_ISLOWER(ch))
11475 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011476 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011477 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011478}
11479
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011480PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011481 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011483Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011484at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485
11486static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011487unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 Py_ssize_t i, length;
11490 int kind;
11491 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492 int cased;
11493
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 if (PyUnicode_READY(self) == -1)
11495 return NULL;
11496 length = PyUnicode_GET_LENGTH(self);
11497 kind = PyUnicode_KIND(self);
11498 data = PyUnicode_DATA(self);
11499
Guido van Rossumd57fd912000-03-10 22:53:23 +000011500 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011501 if (length == 1)
11502 return PyBool_FromLong(
11503 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011505 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011506 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011507 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011508
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011510 for (i = 0; i < length; i++) {
11511 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011512
Benjamin Peterson29060642009-01-31 22:14:21 +000011513 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11514 return PyBool_FromLong(0);
11515 else if (!cased && Py_UNICODE_ISUPPER(ch))
11516 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011518 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011519}
11520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011521PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011522 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011524Return True if S is a titlecased string and there is at least one\n\
11525character in S, i.e. upper- and titlecase characters may only\n\
11526follow uncased characters and lowercase characters only cased ones.\n\
11527Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528
11529static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011530unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011532 Py_ssize_t i, length;
11533 int kind;
11534 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535 int cased, previous_is_cased;
11536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537 if (PyUnicode_READY(self) == -1)
11538 return NULL;
11539 length = PyUnicode_GET_LENGTH(self);
11540 kind = PyUnicode_KIND(self);
11541 data = PyUnicode_DATA(self);
11542
Guido van Rossumd57fd912000-03-10 22:53:23 +000011543 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011544 if (length == 1) {
11545 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11546 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11547 (Py_UNICODE_ISUPPER(ch) != 0));
11548 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011549
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011550 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011553
Guido van Rossumd57fd912000-03-10 22:53:23 +000011554 cased = 0;
11555 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011556 for (i = 0; i < length; i++) {
11557 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011558
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11560 if (previous_is_cased)
11561 return PyBool_FromLong(0);
11562 previous_is_cased = 1;
11563 cased = 1;
11564 }
11565 else if (Py_UNICODE_ISLOWER(ch)) {
11566 if (!previous_is_cased)
11567 return PyBool_FromLong(0);
11568 previous_is_cased = 1;
11569 cased = 1;
11570 }
11571 else
11572 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011574 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575}
11576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011577PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011578 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011580Return True if all characters in S are whitespace\n\
11581and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582
11583static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011584unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011586 Py_ssize_t i, length;
11587 int kind;
11588 void *data;
11589
11590 if (PyUnicode_READY(self) == -1)
11591 return NULL;
11592 length = PyUnicode_GET_LENGTH(self);
11593 kind = PyUnicode_KIND(self);
11594 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011597 if (length == 1)
11598 return PyBool_FromLong(
11599 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011601 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011602 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011603 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011605 for (i = 0; i < length; i++) {
11606 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011607 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011608 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011610 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611}
11612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011613PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011614 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011615\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011616Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011617and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011618
11619static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011620unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011621{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011622 Py_ssize_t i, length;
11623 int kind;
11624 void *data;
11625
11626 if (PyUnicode_READY(self) == -1)
11627 return NULL;
11628 length = PyUnicode_GET_LENGTH(self);
11629 kind = PyUnicode_KIND(self);
11630 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011631
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011632 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011633 if (length == 1)
11634 return PyBool_FromLong(
11635 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011636
11637 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011639 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011641 for (i = 0; i < length; i++) {
11642 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011643 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011644 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011645 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011646}
11647
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011648PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011649 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011650\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011651Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011652and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011653
11654static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011655unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011656{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011657 int kind;
11658 void *data;
11659 Py_ssize_t len, i;
11660
11661 if (PyUnicode_READY(self) == -1)
11662 return NULL;
11663
11664 kind = PyUnicode_KIND(self);
11665 data = PyUnicode_DATA(self);
11666 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011667
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011668 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011669 if (len == 1) {
11670 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11671 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11672 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011673
11674 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011676 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011677
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 for (i = 0; i < len; i++) {
11679 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011680 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011681 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011682 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011683 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011684}
11685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011686PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011687 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011689Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011690False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691
11692static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011693unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011695 Py_ssize_t i, length;
11696 int kind;
11697 void *data;
11698
11699 if (PyUnicode_READY(self) == -1)
11700 return NULL;
11701 length = PyUnicode_GET_LENGTH(self);
11702 kind = PyUnicode_KIND(self);
11703 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706 if (length == 1)
11707 return PyBool_FromLong(
11708 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011710 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011712 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011713
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714 for (i = 0; i < length; i++) {
11715 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011716 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011718 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011719}
11720
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011721PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011722 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011724Return True if all characters in S are digits\n\
11725and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726
11727static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011728unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011729{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011730 Py_ssize_t i, length;
11731 int kind;
11732 void *data;
11733
11734 if (PyUnicode_READY(self) == -1)
11735 return NULL;
11736 length = PyUnicode_GET_LENGTH(self);
11737 kind = PyUnicode_KIND(self);
11738 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739
Guido van Rossumd57fd912000-03-10 22:53:23 +000011740 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011741 if (length == 1) {
11742 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11743 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011746 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011747 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011748 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011749
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750 for (i = 0; i < length; i++) {
11751 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011752 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011754 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755}
11756
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011757PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011758 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011760Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011761False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762
11763static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011764unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011766 Py_ssize_t i, length;
11767 int kind;
11768 void *data;
11769
11770 if (PyUnicode_READY(self) == -1)
11771 return NULL;
11772 length = PyUnicode_GET_LENGTH(self);
11773 kind = PyUnicode_KIND(self);
11774 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011777 if (length == 1)
11778 return PyBool_FromLong(
11779 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011781 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011782 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011783 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011784
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011785 for (i = 0; i < length; i++) {
11786 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011787 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011789 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790}
11791
Martin v. Löwis47383402007-08-15 07:32:56 +000011792int
11793PyUnicode_IsIdentifier(PyObject *self)
11794{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 int kind;
11796 void *data;
11797 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011798 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 if (PyUnicode_READY(self) == -1) {
11801 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011802 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 }
11804
11805 /* Special case for empty strings */
11806 if (PyUnicode_GET_LENGTH(self) == 0)
11807 return 0;
11808 kind = PyUnicode_KIND(self);
11809 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011810
11811 /* PEP 3131 says that the first character must be in
11812 XID_Start and subsequent characters in XID_Continue,
11813 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011814 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011815 letters, digits, underscore). However, given the current
11816 definition of XID_Start and XID_Continue, it is sufficient
11817 to check just for these, except that _ must be allowed
11818 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011820 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011821 return 0;
11822
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011823 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011825 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011826 return 1;
11827}
11828
11829PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011830 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011831\n\
11832Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011833to the language definition.\n\
11834\n\
11835Use keyword.iskeyword() to test for reserved identifiers\n\
11836such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011837
11838static PyObject*
11839unicode_isidentifier(PyObject *self)
11840{
11841 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11842}
11843
Georg Brandl559e5d72008-06-11 18:37:52 +000011844PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011845 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011846\n\
11847Return True if all characters in S are considered\n\
11848printable in repr() or S is empty, False otherwise.");
11849
11850static PyObject*
11851unicode_isprintable(PyObject *self)
11852{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 Py_ssize_t i, length;
11854 int kind;
11855 void *data;
11856
11857 if (PyUnicode_READY(self) == -1)
11858 return NULL;
11859 length = PyUnicode_GET_LENGTH(self);
11860 kind = PyUnicode_KIND(self);
11861 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011862
11863 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864 if (length == 1)
11865 return PyBool_FromLong(
11866 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011867
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 for (i = 0; i < length; i++) {
11869 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011870 Py_RETURN_FALSE;
11871 }
11872 }
11873 Py_RETURN_TRUE;
11874}
11875
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011876PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011877 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011878\n\
11879Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011880iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881
11882static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011883unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011884{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011885 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886}
11887
Martin v. Löwis18e16552006-02-15 17:27:45 +000011888static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011889unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011891 if (PyUnicode_READY(self) == -1)
11892 return -1;
11893 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894}
11895
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011896PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011897 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011899Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011900done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901
11902static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011903unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011905 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 Py_UCS4 fillchar = ' ';
11907
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011908 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909 return NULL;
11910
Benjamin Petersonbac79492012-01-14 13:34:47 -050011911 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011912 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913
Victor Stinnerc4b49542011-12-11 22:44:26 +010011914 if (PyUnicode_GET_LENGTH(self) >= width)
11915 return unicode_result_unchanged(self);
11916
11917 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011918}
11919
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011920PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011921 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011923Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924
11925static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011926unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011928 if (PyUnicode_READY(self) == -1)
11929 return NULL;
11930 if (PyUnicode_IS_ASCII(self))
11931 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011932 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011933}
11934
/* Which end(s) of a string a strip operation trims.  The values are also
   used as indices into stripformat[] below, so they must remain 0, 1, 2. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the strip type above.
   Both the strings and the pointer table itself are read-only. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)] + 3)
11943
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011944/* externally visible for str.strip(unicode) */
11945PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011946_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011947{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 void *data;
11949 int kind;
11950 Py_ssize_t i, j, len;
11951 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011952 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11955 return NULL;
11956
11957 kind = PyUnicode_KIND(self);
11958 data = PyUnicode_DATA(self);
11959 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011960 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011961 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11962 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011963 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011964
Benjamin Peterson14339b62009-01-31 16:36:08 +000011965 i = 0;
11966 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011967 while (i < len) {
11968 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11969 if (!BLOOM(sepmask, ch))
11970 break;
11971 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11972 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011973 i++;
11974 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011975 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011976
Benjamin Peterson14339b62009-01-31 16:36:08 +000011977 j = len;
11978 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011979 j--;
11980 while (j >= i) {
11981 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11982 if (!BLOOM(sepmask, ch))
11983 break;
11984 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11985 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011986 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011987 }
11988
Benjamin Peterson29060642009-01-31 22:14:21 +000011989 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011990 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011991
Victor Stinner7931d9a2011-11-04 00:22:48 +010011992 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993}
11994
11995PyObject*
11996PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11997{
11998 unsigned char *data;
11999 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012000 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001
Victor Stinnerde636f32011-10-01 03:55:54 +020012002 if (PyUnicode_READY(self) == -1)
12003 return NULL;
12004
Victor Stinner684d5fd2012-05-03 02:32:34 +020012005 length = PyUnicode_GET_LENGTH(self);
12006 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012007
Victor Stinner684d5fd2012-05-03 02:32:34 +020012008 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012009 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012010
Victor Stinnerde636f32011-10-01 03:55:54 +020012011 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012012 PyErr_SetString(PyExc_IndexError, "string index out of range");
12013 return NULL;
12014 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012015 if (start >= length || end < start)
12016 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012017
Victor Stinner684d5fd2012-05-03 02:32:34 +020012018 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012019 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012020 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012021 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012022 }
12023 else {
12024 kind = PyUnicode_KIND(self);
12025 data = PyUnicode_1BYTE_DATA(self);
12026 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012027 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012028 length);
12029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031
12032static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012033do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 Py_ssize_t len, i, j;
12036
12037 if (PyUnicode_READY(self) == -1)
12038 return NULL;
12039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012041
Victor Stinnercc7af722013-04-09 22:39:24 +020012042 if (PyUnicode_IS_ASCII(self)) {
12043 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12044
12045 i = 0;
12046 if (striptype != RIGHTSTRIP) {
12047 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012048 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012049 if (!_Py_ascii_whitespace[ch])
12050 break;
12051 i++;
12052 }
12053 }
12054
12055 j = len;
12056 if (striptype != LEFTSTRIP) {
12057 j--;
12058 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012059 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012060 if (!_Py_ascii_whitespace[ch])
12061 break;
12062 j--;
12063 }
12064 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012065 }
12066 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012067 else {
12068 int kind = PyUnicode_KIND(self);
12069 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012070
Victor Stinnercc7af722013-04-09 22:39:24 +020012071 i = 0;
12072 if (striptype != RIGHTSTRIP) {
12073 while (i < len) {
12074 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12075 if (!Py_UNICODE_ISSPACE(ch))
12076 break;
12077 i++;
12078 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012079 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012080
12081 j = len;
12082 if (striptype != LEFTSTRIP) {
12083 j--;
12084 while (j >= i) {
12085 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12086 if (!Py_UNICODE_ISSPACE(ch))
12087 break;
12088 j--;
12089 }
12090 j++;
12091 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012092 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012093
Victor Stinner7931d9a2011-11-04 00:22:48 +010012094 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095}
12096
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012097
12098static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012099do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012100{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012101 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012102
Serhiy Storchakac6792272013-10-19 21:03:34 +030012103 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012104 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012105
Benjamin Peterson14339b62009-01-31 16:36:08 +000012106 if (sep != NULL && sep != Py_None) {
12107 if (PyUnicode_Check(sep))
12108 return _PyUnicode_XStrip(self, striptype, sep);
12109 else {
12110 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012111 "%s arg must be None or str",
12112 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012113 return NULL;
12114 }
12115 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012116
Benjamin Peterson14339b62009-01-31 16:36:08 +000012117 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012118}
12119
12120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012121PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012122 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012123\n\
12124Return a copy of the string S with leading and trailing\n\
12125whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012126If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012127
12128static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012129unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012130{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012131 if (PyTuple_GET_SIZE(args) == 0)
12132 return do_strip(self, BOTHSTRIP); /* Common case */
12133 else
12134 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012135}
12136
12137
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012138PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012139 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012140\n\
12141Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012142If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012143
12144static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012145unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012146{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012147 if (PyTuple_GET_SIZE(args) == 0)
12148 return do_strip(self, LEFTSTRIP); /* Common case */
12149 else
12150 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012151}
12152
12153
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012154PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012155 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012156\n\
12157Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012158If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012159
12160static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012161unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012162{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012163 if (PyTuple_GET_SIZE(args) == 0)
12164 return do_strip(self, RIGHTSTRIP); /* Common case */
12165 else
12166 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012167}
12168
12169
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012171unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012173 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175
Serhiy Storchaka05997252013-01-26 12:14:02 +020012176 if (len < 1)
12177 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012178
Victor Stinnerc4b49542011-12-11 22:44:26 +010012179 /* no repeat, return original string */
12180 if (len == 1)
12181 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012182
Benjamin Petersonbac79492012-01-14 13:34:47 -050012183 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 return NULL;
12185
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012186 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012187 PyErr_SetString(PyExc_OverflowError,
12188 "repeated string is too long");
12189 return NULL;
12190 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012192
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012193 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194 if (!u)
12195 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012196 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 if (PyUnicode_GET_LENGTH(str) == 1) {
12199 const int kind = PyUnicode_KIND(str);
12200 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012201 if (kind == PyUnicode_1BYTE_KIND) {
12202 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012203 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012204 }
12205 else if (kind == PyUnicode_2BYTE_KIND) {
12206 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012207 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012208 ucs2[n] = fill_char;
12209 } else {
12210 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12211 assert(kind == PyUnicode_4BYTE_KIND);
12212 for (n = 0; n < len; ++n)
12213 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012214 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 }
12216 else {
12217 /* number of characters copied this far */
12218 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012219 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 char *to = (char *) PyUnicode_DATA(u);
12221 Py_MEMCPY(to, PyUnicode_DATA(str),
12222 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012223 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 n = (done <= nchars-done) ? done : nchars-done;
12225 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012226 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012227 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228 }
12229
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012230 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012231 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232}
12233
Alexander Belopolsky40018472011-02-26 01:02:56 +000012234PyObject *
12235PyUnicode_Replace(PyObject *obj,
12236 PyObject *subobj,
12237 PyObject *replobj,
12238 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012239{
12240 PyObject *self;
12241 PyObject *str1;
12242 PyObject *str2;
12243 PyObject *result;
12244
12245 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012246 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012247 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012249 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012250 Py_DECREF(self);
12251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252 }
12253 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012254 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012255 Py_DECREF(self);
12256 Py_DECREF(str1);
12257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012259 if (PyUnicode_READY(self) == -1 ||
12260 PyUnicode_READY(str1) == -1 ||
12261 PyUnicode_READY(str2) == -1)
12262 result = NULL;
12263 else
12264 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265 Py_DECREF(self);
12266 Py_DECREF(str1);
12267 Py_DECREF(str2);
12268 return result;
12269}
12270
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012271PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012272 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273\n\
12274Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012275old replaced by new. If the optional argument count is\n\
12276given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277
12278static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012281 PyObject *str1;
12282 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012283 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284 PyObject *result;
12285
Martin v. Löwis18e16552006-02-15 17:27:45 +000012286 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012288 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012289 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012290 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012291 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 return NULL;
12293 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012294 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012295 Py_DECREF(str1);
12296 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012297 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012298 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12299 result = NULL;
12300 else
12301 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302
12303 Py_DECREF(str1);
12304 Py_DECREF(str2);
12305 return result;
12306}
12307
Alexander Belopolsky40018472011-02-26 01:02:56 +000012308static PyObject *
12309unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012310{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012311 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 Py_ssize_t isize;
12313 Py_ssize_t osize, squote, dquote, i, o;
12314 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012315 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012317
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012319 return NULL;
12320
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012321 isize = PyUnicode_GET_LENGTH(unicode);
12322 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 /* Compute length of output, quote characters, and
12325 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012326 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 max = 127;
12328 squote = dquote = 0;
12329 ikind = PyUnicode_KIND(unicode);
12330 for (i = 0; i < isize; i++) {
12331 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012332 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012334 case '\'': squote++; break;
12335 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012337 incr = 2;
12338 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012339 default:
12340 /* Fast-path ASCII */
12341 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012342 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012344 ;
12345 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012348 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012349 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012350 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012352 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012354 if (osize > PY_SSIZE_T_MAX - incr) {
12355 PyErr_SetString(PyExc_OverflowError,
12356 "string is too long to generate repr");
12357 return NULL;
12358 }
12359 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012360 }
12361
12362 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012363 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012365 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012366 if (dquote)
12367 /* Both squote and dquote present. Use squote,
12368 and escape them */
12369 osize += squote;
12370 else
12371 quote = '"';
12372 }
Victor Stinner55c08782013-04-14 18:45:39 +020012373 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374
12375 repr = PyUnicode_New(osize, max);
12376 if (repr == NULL)
12377 return NULL;
12378 okind = PyUnicode_KIND(repr);
12379 odata = PyUnicode_DATA(repr);
12380
12381 PyUnicode_WRITE(okind, odata, 0, quote);
12382 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012383 if (unchanged) {
12384 _PyUnicode_FastCopyCharacters(repr, 1,
12385 unicode, 0,
12386 isize);
12387 }
12388 else {
12389 for (i = 0, o = 1; i < isize; i++) {
12390 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391
Victor Stinner55c08782013-04-14 18:45:39 +020012392 /* Escape quotes and backslashes */
12393 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012394 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012395 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012396 continue;
12397 }
12398
12399 /* Map special whitespace to '\t', \n', '\r' */
12400 if (ch == '\t') {
12401 PyUnicode_WRITE(okind, odata, o++, '\\');
12402 PyUnicode_WRITE(okind, odata, o++, 't');
12403 }
12404 else if (ch == '\n') {
12405 PyUnicode_WRITE(okind, odata, o++, '\\');
12406 PyUnicode_WRITE(okind, odata, o++, 'n');
12407 }
12408 else if (ch == '\r') {
12409 PyUnicode_WRITE(okind, odata, o++, '\\');
12410 PyUnicode_WRITE(okind, odata, o++, 'r');
12411 }
12412
12413 /* Map non-printable US ASCII to '\xhh' */
12414 else if (ch < ' ' || ch == 0x7F) {
12415 PyUnicode_WRITE(okind, odata, o++, '\\');
12416 PyUnicode_WRITE(okind, odata, o++, 'x');
12417 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12418 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12419 }
12420
12421 /* Copy ASCII characters as-is */
12422 else if (ch < 0x7F) {
12423 PyUnicode_WRITE(okind, odata, o++, ch);
12424 }
12425
12426 /* Non-ASCII characters */
12427 else {
12428 /* Map Unicode whitespace and control characters
12429 (categories Z* and C* except ASCII space)
12430 */
12431 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12432 PyUnicode_WRITE(okind, odata, o++, '\\');
12433 /* Map 8-bit characters to '\xhh' */
12434 if (ch <= 0xff) {
12435 PyUnicode_WRITE(okind, odata, o++, 'x');
12436 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12437 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12438 }
12439 /* Map 16-bit characters to '\uxxxx' */
12440 else if (ch <= 0xffff) {
12441 PyUnicode_WRITE(okind, odata, o++, 'u');
12442 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12443 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12444 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12445 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12446 }
12447 /* Map 21-bit characters to '\U00xxxxxx' */
12448 else {
12449 PyUnicode_WRITE(okind, odata, o++, 'U');
12450 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12451 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12452 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12453 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12454 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12455 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12456 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12457 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12458 }
12459 }
12460 /* Copy characters as-is */
12461 else {
12462 PyUnicode_WRITE(okind, odata, o++, ch);
12463 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012464 }
12465 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012466 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012468 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012469 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470}
12471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012472PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012473 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474\n\
12475Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012476such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477arguments start and end are interpreted as in slice notation.\n\
12478\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012479Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012480
12481static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012483{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012484 /* initialize variables to prevent gcc warning */
12485 PyObject *substring = NULL;
12486 Py_ssize_t start = 0;
12487 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012488 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489
Jesus Ceaac451502011-04-20 17:09:23 +020012490 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12491 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012492 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493
Christian Heimesea71a522013-06-29 21:17:34 +020012494 if (PyUnicode_READY(self) == -1) {
12495 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012497 }
12498 if (PyUnicode_READY(substring) == -1) {
12499 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012501 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012502
Victor Stinner7931d9a2011-11-04 00:22:48 +010012503 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012504
12505 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012506
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012507 if (result == -2)
12508 return NULL;
12509
Christian Heimes217cfd12007-12-02 14:31:20 +000012510 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012511}
12512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012513PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012514 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012516Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012517
12518static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012521 /* initialize variables to prevent gcc warning */
12522 PyObject *substring = NULL;
12523 Py_ssize_t start = 0;
12524 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012525 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526
Jesus Ceaac451502011-04-20 17:09:23 +020012527 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12528 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012529 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530
Christian Heimesea71a522013-06-29 21:17:34 +020012531 if (PyUnicode_READY(self) == -1) {
12532 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012534 }
12535 if (PyUnicode_READY(substring) == -1) {
12536 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012538 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012539
Victor Stinner7931d9a2011-11-04 00:22:48 +010012540 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541
12542 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 if (result == -2)
12545 return NULL;
12546
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547 if (result < 0) {
12548 PyErr_SetString(PyExc_ValueError, "substring not found");
12549 return NULL;
12550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551
Christian Heimes217cfd12007-12-02 14:31:20 +000012552 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012553}
12554
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012555PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012556 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012558Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012559done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012560
12561static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012562unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012563{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012564 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012565 Py_UCS4 fillchar = ' ';
12566
Victor Stinnere9a29352011-10-01 02:14:59 +020012567 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012568 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012569
Benjamin Petersonbac79492012-01-14 13:34:47 -050012570 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012571 return NULL;
12572
Victor Stinnerc4b49542011-12-11 22:44:26 +010012573 if (PyUnicode_GET_LENGTH(self) >= width)
12574 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575
Victor Stinnerc4b49542011-12-11 22:44:26 +010012576 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012577}
12578
Alexander Belopolsky40018472011-02-26 01:02:56 +000012579PyObject *
12580PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581{
12582 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012583
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584 s = PyUnicode_FromObject(s);
12585 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012586 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012587 if (sep != NULL) {
12588 sep = PyUnicode_FromObject(sep);
12589 if (sep == NULL) {
12590 Py_DECREF(s);
12591 return NULL;
12592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593 }
12594
Victor Stinner9310abb2011-10-05 00:59:23 +020012595 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596
12597 Py_DECREF(s);
12598 Py_XDECREF(sep);
12599 return result;
12600}
12601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012602PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012603 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012604\n\
12605Return a list of the words in S, using sep as the\n\
12606delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012607splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012608whitespace string is a separator and empty strings are\n\
12609removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610
12611static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012612unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012614 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012615 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012616 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012618 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12619 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620 return NULL;
12621
12622 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012623 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012625 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012627 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628}
12629
Thomas Wouters477c8d52006-05-27 19:21:47 +000012630PyObject *
12631PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12632{
12633 PyObject* str_obj;
12634 PyObject* sep_obj;
12635 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012636 int kind1, kind2;
12637 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012639
12640 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012641 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012642 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012643 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012644 if (!sep_obj) {
12645 Py_DECREF(str_obj);
12646 return NULL;
12647 }
12648 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12649 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012650 Py_DECREF(str_obj);
12651 return NULL;
12652 }
12653
Victor Stinner14f8f022011-10-05 20:58:25 +020012654 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 len1 = PyUnicode_GET_LENGTH(str_obj);
12657 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012658 if (kind1 < kind2 || len1 < len2) {
12659 _Py_INCREF_UNICODE_EMPTY();
12660 if (!unicode_empty)
12661 out = NULL;
12662 else {
12663 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12664 Py_DECREF(unicode_empty);
12665 }
12666 Py_DECREF(sep_obj);
12667 Py_DECREF(str_obj);
12668 return out;
12669 }
12670 buf1 = PyUnicode_DATA(str_obj);
12671 buf2 = PyUnicode_DATA(sep_obj);
12672 if (kind2 != kind1) {
12673 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12674 if (!buf2)
12675 goto onError;
12676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012678 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012679 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012680 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12681 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12682 else
12683 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 break;
12685 case PyUnicode_2BYTE_KIND:
12686 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12687 break;
12688 case PyUnicode_4BYTE_KIND:
12689 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12690 break;
12691 default:
12692 assert(0);
12693 out = 0;
12694 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012695
12696 Py_DECREF(sep_obj);
12697 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012698 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012699 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012700
12701 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012702 onError:
12703 Py_DECREF(sep_obj);
12704 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012705 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 PyMem_Free(buf2);
12707 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012708}
12709
12710
12711PyObject *
12712PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12713{
12714 PyObject* str_obj;
12715 PyObject* sep_obj;
12716 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012717 int kind1, kind2;
12718 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012720
12721 str_obj = PyUnicode_FromObject(str_in);
12722 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012723 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012724 sep_obj = PyUnicode_FromObject(sep_in);
12725 if (!sep_obj) {
12726 Py_DECREF(str_obj);
12727 return NULL;
12728 }
12729
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012730 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012731 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 len1 = PyUnicode_GET_LENGTH(str_obj);
12733 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012734 if (kind1 < kind2 || len1 < len2) {
12735 _Py_INCREF_UNICODE_EMPTY();
12736 if (!unicode_empty)
12737 out = NULL;
12738 else {
12739 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12740 Py_DECREF(unicode_empty);
12741 }
12742 Py_DECREF(sep_obj);
12743 Py_DECREF(str_obj);
12744 return out;
12745 }
12746 buf1 = PyUnicode_DATA(str_obj);
12747 buf2 = PyUnicode_DATA(sep_obj);
12748 if (kind2 != kind1) {
12749 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12750 if (!buf2)
12751 goto onError;
12752 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012753
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012754 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012755 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012756 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12757 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12758 else
12759 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012760 break;
12761 case PyUnicode_2BYTE_KIND:
12762 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12763 break;
12764 case PyUnicode_4BYTE_KIND:
12765 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12766 break;
12767 default:
12768 assert(0);
12769 out = 0;
12770 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012771
12772 Py_DECREF(sep_obj);
12773 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012774 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012775 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012776
12777 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012778 onError:
12779 Py_DECREF(sep_obj);
12780 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012781 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782 PyMem_Free(buf2);
12783 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012784}
12785
12786PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012787 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012788\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012789Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012790the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012791found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012792
12793static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012794unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012795{
Victor Stinner9310abb2011-10-05 00:59:23 +020012796 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012797}
12798
12799PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012800 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012801\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012802Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012803the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012804separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012805
12806static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012807unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012808{
Victor Stinner9310abb2011-10-05 00:59:23 +020012809 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012810}
12811
Alexander Belopolsky40018472011-02-26 01:02:56 +000012812PyObject *
12813PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012814{
12815 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012816
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012817 s = PyUnicode_FromObject(s);
12818 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012819 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012820 if (sep != NULL) {
12821 sep = PyUnicode_FromObject(sep);
12822 if (sep == NULL) {
12823 Py_DECREF(s);
12824 return NULL;
12825 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012826 }
12827
Victor Stinner9310abb2011-10-05 00:59:23 +020012828 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012829
12830 Py_DECREF(s);
12831 Py_XDECREF(sep);
12832 return result;
12833}
12834
12835PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012836 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012837\n\
12838Return a list of the words in S, using sep as the\n\
12839delimiter string, starting at the end of the string and\n\
12840working to the front. If maxsplit is given, at most maxsplit\n\
12841splits are done. If sep is not specified, any whitespace string\n\
12842is a separator.");
12843
12844static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012845unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012846{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012847 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012848 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012849 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012850
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012851 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12852 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012853 return NULL;
12854
12855 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012856 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012857 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012858 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012859 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012860 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012861}
12862
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012863PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012864 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012865\n\
12866Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012867Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012868is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012869
12870static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012871unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012873 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012874 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012875
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012876 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12877 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012878 return NULL;
12879
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012880 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881}
12882
12883static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012884PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012886 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887}
12888
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012889PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012890 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891\n\
12892Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012893and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894
12895static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012896unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012898 if (PyUnicode_READY(self) == -1)
12899 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012900 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901}
12902
Larry Hastings61272b72014-01-07 12:41:53 -080012903/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012904
Larry Hastings31826802013-10-19 00:09:25 -070012905@staticmethod
12906str.maketrans as unicode_maketrans
12907
12908 x: object
12909
12910 y: unicode=NULL
12911
12912 z: unicode=NULL
12913
12914 /
12915
12916Return a translation table usable for str.translate().
12917
12918If there is only one argument, it must be a dictionary mapping Unicode
12919ordinals (integers) or characters to Unicode ordinals, strings or None.
12920Character keys will be then converted to ordinals.
12921If there are two arguments, they must be strings of equal length, and
12922in the resulting dictionary, each character in x will be mapped to the
12923character at the same position in y. If there is a third argument, it
12924must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012925[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012926
Larry Hastings31826802013-10-19 00:09:25 -070012927static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012928unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012929/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012930{
Georg Brandlceee0772007-11-27 23:48:05 +000012931 PyObject *new = NULL, *key, *value;
12932 Py_ssize_t i = 0;
12933 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012934
Georg Brandlceee0772007-11-27 23:48:05 +000012935 new = PyDict_New();
12936 if (!new)
12937 return NULL;
12938 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012939 int x_kind, y_kind, z_kind;
12940 void *x_data, *y_data, *z_data;
12941
Georg Brandlceee0772007-11-27 23:48:05 +000012942 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012943 if (!PyUnicode_Check(x)) {
12944 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12945 "be a string if there is a second argument");
12946 goto err;
12947 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012948 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012949 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12950 "arguments must have equal length");
12951 goto err;
12952 }
12953 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012954 x_kind = PyUnicode_KIND(x);
12955 y_kind = PyUnicode_KIND(y);
12956 x_data = PyUnicode_DATA(x);
12957 y_data = PyUnicode_DATA(y);
12958 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12959 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012960 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012961 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012962 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012963 if (!value) {
12964 Py_DECREF(key);
12965 goto err;
12966 }
Georg Brandlceee0772007-11-27 23:48:05 +000012967 res = PyDict_SetItem(new, key, value);
12968 Py_DECREF(key);
12969 Py_DECREF(value);
12970 if (res < 0)
12971 goto err;
12972 }
12973 /* create entries for deleting chars in z */
12974 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012975 z_kind = PyUnicode_KIND(z);
12976 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012977 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012978 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012979 if (!key)
12980 goto err;
12981 res = PyDict_SetItem(new, key, Py_None);
12982 Py_DECREF(key);
12983 if (res < 0)
12984 goto err;
12985 }
12986 }
12987 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012988 int kind;
12989 void *data;
12990
Georg Brandlceee0772007-11-27 23:48:05 +000012991 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012992 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012993 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12994 "to maketrans it must be a dict");
12995 goto err;
12996 }
12997 /* copy entries into the new dict, converting string keys to int keys */
12998 while (PyDict_Next(x, &i, &key, &value)) {
12999 if (PyUnicode_Check(key)) {
13000 /* convert string keys to integer keys */
13001 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013002 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013003 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13004 "table must be of length 1");
13005 goto err;
13006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007 kind = PyUnicode_KIND(key);
13008 data = PyUnicode_DATA(key);
13009 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013010 if (!newkey)
13011 goto err;
13012 res = PyDict_SetItem(new, newkey, value);
13013 Py_DECREF(newkey);
13014 if (res < 0)
13015 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013016 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013017 /* just keep integer keys */
13018 if (PyDict_SetItem(new, key, value) < 0)
13019 goto err;
13020 } else {
13021 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13022 "be strings or integers");
13023 goto err;
13024 }
13025 }
13026 }
13027 return new;
13028 err:
13029 Py_DECREF(new);
13030 return NULL;
13031}
13032
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013033PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013034 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013035\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013036Return a copy of the string S in which each character has been mapped\n\
13037through the given translation table. The table must implement\n\
13038lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13039mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13040this operation raises LookupError, the character is left untouched.\n\
13041Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013042
13043static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013044unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013045{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013046 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013047}
13048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013049PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013050 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013051\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013052Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013053
13054static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013055unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013056{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013057 if (PyUnicode_READY(self) == -1)
13058 return NULL;
13059 if (PyUnicode_IS_ASCII(self))
13060 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013061 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013062}
13063
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013064PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013065 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013067Pad a numeric string S with zeros on the left, to fill a field\n\
13068of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013069
13070static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013071unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013072{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013073 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013074 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013075 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013076 int kind;
13077 void *data;
13078 Py_UCS4 chr;
13079
Martin v. Löwis18e16552006-02-15 17:27:45 +000013080 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013081 return NULL;
13082
Benjamin Petersonbac79492012-01-14 13:34:47 -050013083 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013084 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085
Victor Stinnerc4b49542011-12-11 22:44:26 +010013086 if (PyUnicode_GET_LENGTH(self) >= width)
13087 return unicode_result_unchanged(self);
13088
13089 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013090
13091 u = pad(self, fill, 0, '0');
13092
Walter Dörwald068325e2002-04-15 13:36:47 +000013093 if (u == NULL)
13094 return NULL;
13095
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013096 kind = PyUnicode_KIND(u);
13097 data = PyUnicode_DATA(u);
13098 chr = PyUnicode_READ(kind, data, fill);
13099
13100 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013101 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013102 PyUnicode_WRITE(kind, data, 0, chr);
13103 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104 }
13105
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013106 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013107 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013109
#if 0
/* Compiled out by the enclosing #if 0.  When enabled, it simply forwards
   self to PyUnicode_TransformDecimalAndSpaceToASCII() and returns the
   result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13117
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013118PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013119 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013121Return True if S starts with the specified prefix, False otherwise.\n\
13122With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013123With optional end, stop comparing S at that position.\n\
13124prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013125
13126static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013127unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013128 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013130 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013131 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013132 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013133 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013134 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135
Jesus Ceaac451502011-04-20 17:09:23 +020013136 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013137 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013138 if (PyTuple_Check(subobj)) {
13139 Py_ssize_t i;
13140 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013141 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013142 if (substring == NULL)
13143 return NULL;
13144 result = tailmatch(self, substring, start, end, -1);
13145 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013146 if (result == -1)
13147 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013148 if (result) {
13149 Py_RETURN_TRUE;
13150 }
13151 }
13152 /* nothing matched */
13153 Py_RETURN_FALSE;
13154 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013155 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013156 if (substring == NULL) {
13157 if (PyErr_ExceptionMatches(PyExc_TypeError))
13158 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13159 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013160 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013161 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013162 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013163 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013164 if (result == -1)
13165 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013166 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013167}
13168
13169
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013170PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013171 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013172\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013173Return True if S ends with the specified suffix, False otherwise.\n\
13174With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013175With optional end, stop comparing S at that position.\n\
13176suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013177
13178static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013179unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013180 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013181{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013182 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013183 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013184 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013185 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013186 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013187
Jesus Ceaac451502011-04-20 17:09:23 +020013188 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013189 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013190 if (PyTuple_Check(subobj)) {
13191 Py_ssize_t i;
13192 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013193 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013194 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013195 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013196 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013197 result = tailmatch(self, substring, start, end, +1);
13198 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013199 if (result == -1)
13200 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013201 if (result) {
13202 Py_RETURN_TRUE;
13203 }
13204 }
13205 Py_RETURN_FALSE;
13206 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013207 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013208 if (substring == NULL) {
13209 if (PyErr_ExceptionMatches(PyExc_TypeError))
13210 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13211 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013212 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013213 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013214 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013215 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013216 if (result == -1)
13217 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013218 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013219}
13220
Victor Stinner202fdca2012-05-07 12:47:02 +020013221Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013222_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013223{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013224 if (!writer->readonly)
13225 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13226 else {
13227 /* Copy-on-write mode: set buffer size to 0 so
13228 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13229 * next write. */
13230 writer->size = 0;
13231 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013232 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13233 writer->data = PyUnicode_DATA(writer->buffer);
13234 writer->kind = PyUnicode_KIND(writer->buffer);
13235}
13236
Victor Stinnerd3f08822012-05-29 12:57:52 +020013237void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013238_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013239{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013240 memset(writer, 0, sizeof(*writer));
13241#ifdef Py_DEBUG
13242 writer->kind = 5; /* invalid kind */
13243#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013244 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013245}
13246
Victor Stinnerd3f08822012-05-29 12:57:52 +020013247int
13248_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13249 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013250{
Victor Stinner6989ba02013-11-18 21:08:39 +010013251#ifdef MS_WINDOWS
13252 /* On Windows, overallocate by 50% is the best factor */
13253# define OVERALLOCATE_FACTOR 2
13254#else
13255 /* On Linux, overallocate by 25% is the best factor */
13256# define OVERALLOCATE_FACTOR 4
13257#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013258 Py_ssize_t newlen;
13259 PyObject *newbuffer;
13260
Victor Stinnerd3f08822012-05-29 12:57:52 +020013261 assert(length > 0);
13262
Victor Stinner202fdca2012-05-07 12:47:02 +020013263 if (length > PY_SSIZE_T_MAX - writer->pos) {
13264 PyErr_NoMemory();
13265 return -1;
13266 }
13267 newlen = writer->pos + length;
13268
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013269 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013270
Victor Stinnerd3f08822012-05-29 12:57:52 +020013271 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013272 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013273 if (writer->overallocate
13274 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13275 /* overallocate to limit the number of realloc() */
13276 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013277 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013278 if (newlen < writer->min_length)
13279 newlen = writer->min_length;
13280
Victor Stinnerd3f08822012-05-29 12:57:52 +020013281 writer->buffer = PyUnicode_New(newlen, maxchar);
13282 if (writer->buffer == NULL)
13283 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013284 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013285 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013286 if (writer->overallocate
13287 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13288 /* overallocate to limit the number of realloc() */
13289 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013290 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013291 if (newlen < writer->min_length)
13292 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013293
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013294 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013295 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013296 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013297 newbuffer = PyUnicode_New(newlen, maxchar);
13298 if (newbuffer == NULL)
13299 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013300 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13301 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013302 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013303 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013304 }
13305 else {
13306 newbuffer = resize_compact(writer->buffer, newlen);
13307 if (newbuffer == NULL)
13308 return -1;
13309 }
13310 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013311 }
13312 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013313 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013314 newbuffer = PyUnicode_New(writer->size, maxchar);
13315 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013316 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013317 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13318 writer->buffer, 0, writer->pos);
13319 Py_DECREF(writer->buffer);
13320 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013321 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013322 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013323 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013324
13325#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013326}
13327
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013328Py_LOCAL_INLINE(int)
13329_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013330{
13331 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13332 return -1;
13333 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13334 writer->pos++;
13335 return 0;
13336}
13337
13338int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013339_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13340{
13341 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13342}
13343
13344int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013345_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13346{
13347 Py_UCS4 maxchar;
13348 Py_ssize_t len;
13349
13350 if (PyUnicode_READY(str) == -1)
13351 return -1;
13352 len = PyUnicode_GET_LENGTH(str);
13353 if (len == 0)
13354 return 0;
13355 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13356 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013357 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013358 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013359 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013360 Py_INCREF(str);
13361 writer->buffer = str;
13362 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013363 writer->pos += len;
13364 return 0;
13365 }
13366 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13367 return -1;
13368 }
13369 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13370 str, 0, len);
13371 writer->pos += len;
13372 return 0;
13373}
13374
Victor Stinnere215d962012-10-06 23:03:36 +020013375int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013376_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13377 Py_ssize_t start, Py_ssize_t end)
13378{
13379 Py_UCS4 maxchar;
13380 Py_ssize_t len;
13381
13382 if (PyUnicode_READY(str) == -1)
13383 return -1;
13384
13385 assert(0 <= start);
13386 assert(end <= PyUnicode_GET_LENGTH(str));
13387 assert(start <= end);
13388
13389 if (end == 0)
13390 return 0;
13391
13392 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13393 return _PyUnicodeWriter_WriteStr(writer, str);
13394
13395 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13396 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13397 else
13398 maxchar = writer->maxchar;
13399 len = end - start;
13400
13401 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13402 return -1;
13403
13404 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13405 str, start, len);
13406 writer->pos += len;
13407 return 0;
13408}
13409
13410int
Victor Stinner4a587072013-11-19 12:54:53 +010013411_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13412 const char *ascii, Py_ssize_t len)
13413{
13414 if (len == -1)
13415 len = strlen(ascii);
13416
13417 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13418
13419 if (writer->buffer == NULL && !writer->overallocate) {
13420 PyObject *str;
13421
13422 str = _PyUnicode_FromASCII(ascii, len);
13423 if (str == NULL)
13424 return -1;
13425
13426 writer->readonly = 1;
13427 writer->buffer = str;
13428 _PyUnicodeWriter_Update(writer);
13429 writer->pos += len;
13430 return 0;
13431 }
13432
13433 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13434 return -1;
13435
13436 switch (writer->kind)
13437 {
13438 case PyUnicode_1BYTE_KIND:
13439 {
13440 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13441 Py_UCS1 *data = writer->data;
13442
13443 Py_MEMCPY(data + writer->pos, str, len);
13444 break;
13445 }
13446 case PyUnicode_2BYTE_KIND:
13447 {
13448 _PyUnicode_CONVERT_BYTES(
13449 Py_UCS1, Py_UCS2,
13450 ascii, ascii + len,
13451 (Py_UCS2 *)writer->data + writer->pos);
13452 break;
13453 }
13454 case PyUnicode_4BYTE_KIND:
13455 {
13456 _PyUnicode_CONVERT_BYTES(
13457 Py_UCS1, Py_UCS4,
13458 ascii, ascii + len,
13459 (Py_UCS4 *)writer->data + writer->pos);
13460 break;
13461 }
13462 default:
13463 assert(0);
13464 }
13465
13466 writer->pos += len;
13467 return 0;
13468}
13469
13470int
13471_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13472 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013473{
13474 Py_UCS4 maxchar;
13475
13476 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13477 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13478 return -1;
13479 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13480 writer->pos += len;
13481 return 0;
13482}
13483
Victor Stinnerd3f08822012-05-29 12:57:52 +020013484PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013485_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013486{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013487 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013488 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013489 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013490 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013491 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013492 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013493 str = writer->buffer;
13494 writer->buffer = NULL;
13495 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13496 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013497 }
13498 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13499 PyObject *newbuffer;
13500 newbuffer = resize_compact(writer->buffer, writer->pos);
13501 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013502 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013503 return NULL;
13504 }
13505 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013506 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013507 str = writer->buffer;
13508 writer->buffer = NULL;
13509 assert(_PyUnicode_CheckConsistency(str, 1));
13510 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013511}
13512
Victor Stinnerd3f08822012-05-29 12:57:52 +020013513void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013514_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013515{
13516 Py_CLEAR(writer->buffer);
13517}
13518
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013519#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013520
13521PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013522 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013523\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013524Return a formatted version of S, using substitutions from args and kwargs.\n\
13525The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013526
Eric Smith27bbca62010-11-04 17:06:58 +000013527PyDoc_STRVAR(format_map__doc__,
13528 "S.format_map(mapping) -> str\n\
13529\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013530Return a formatted version of S, using substitutions from mapping.\n\
13531The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013532
Eric Smith4a7d76d2008-05-30 18:10:19 +000013533static PyObject *
13534unicode__format__(PyObject* self, PyObject* args)
13535{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013536 PyObject *format_spec;
13537 _PyUnicodeWriter writer;
13538 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013539
13540 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13541 return NULL;
13542
Victor Stinnerd3f08822012-05-29 12:57:52 +020013543 if (PyUnicode_READY(self) == -1)
13544 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013545 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013546 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13547 self, format_spec, 0,
13548 PyUnicode_GET_LENGTH(format_spec));
13549 if (ret == -1) {
13550 _PyUnicodeWriter_Dealloc(&writer);
13551 return NULL;
13552 }
13553 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013554}
13555
Eric Smith8c663262007-08-25 02:26:07 +000013556PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013557 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013558\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013559Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013560
13561static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013562unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013563{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013564 Py_ssize_t size;
13565
13566 /* If it's a compact object, account for base structure +
13567 character data. */
13568 if (PyUnicode_IS_COMPACT_ASCII(v))
13569 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13570 else if (PyUnicode_IS_COMPACT(v))
13571 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013572 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013573 else {
13574 /* If it is a two-block object, account for base object, and
13575 for character block if present. */
13576 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013577 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013578 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013579 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013580 }
13581 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013582 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013583 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013584 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013585 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013586 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013587
13588 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013589}
13590
13591PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013592 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013593
13594static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013595unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013596{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013597 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013598 if (!copy)
13599 return NULL;
13600 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013601}
13602
Guido van Rossumd57fd912000-03-10 22:53:23 +000013603static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013604 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013605 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013606 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13607 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013608 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13609 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013610 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013611 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13612 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13613 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013614 {"expandtabs", (PyCFunction) unicode_expandtabs,
13615 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013616 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013617 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013618 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13619 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13620 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013621 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013622 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13623 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13624 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013625 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013626 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013627 {"splitlines", (PyCFunction) unicode_splitlines,
13628 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013629 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013630 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13631 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13632 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13633 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13634 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13635 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13636 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13637 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13638 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13639 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13640 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13641 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13642 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13643 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013644 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013645 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013646 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013647 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013648 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013649 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013650 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013651 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013652#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013653 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013654 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013655#endif
13656
Benjamin Peterson14339b62009-01-31 16:36:08 +000013657 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013658 {NULL, NULL}
13659};
13660
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013661static PyObject *
13662unicode_mod(PyObject *v, PyObject *w)
13663{
Brian Curtindfc80e32011-08-10 20:28:54 -050013664 if (!PyUnicode_Check(v))
13665 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013666 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013667}
13668
13669static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013670 0, /*nb_add*/
13671 0, /*nb_subtract*/
13672 0, /*nb_multiply*/
13673 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013674};
13675
Guido van Rossumd57fd912000-03-10 22:53:23 +000013676static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013677 (lenfunc) unicode_length, /* sq_length */
13678 PyUnicode_Concat, /* sq_concat */
13679 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13680 (ssizeargfunc) unicode_getitem, /* sq_item */
13681 0, /* sq_slice */
13682 0, /* sq_ass_item */
13683 0, /* sq_ass_slice */
13684 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013685};
13686
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013687static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013688unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013689{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013690 if (PyUnicode_READY(self) == -1)
13691 return NULL;
13692
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013693 if (PyIndex_Check(item)) {
13694 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013695 if (i == -1 && PyErr_Occurred())
13696 return NULL;
13697 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013698 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013699 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013700 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013701 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013702 PyObject *result;
13703 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013704 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013705 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013707 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013708 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013709 return NULL;
13710 }
13711
13712 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013713 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013714 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013715 slicelength == PyUnicode_GET_LENGTH(self)) {
13716 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013717 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013718 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013719 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013720 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013721 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013722 src_kind = PyUnicode_KIND(self);
13723 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013724 if (!PyUnicode_IS_ASCII(self)) {
13725 kind_limit = kind_maxchar_limit(src_kind);
13726 max_char = 0;
13727 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13728 ch = PyUnicode_READ(src_kind, src_data, cur);
13729 if (ch > max_char) {
13730 max_char = ch;
13731 if (max_char >= kind_limit)
13732 break;
13733 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013734 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013735 }
Victor Stinner55c99112011-10-13 01:17:06 +020013736 else
13737 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013738 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013739 if (result == NULL)
13740 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013741 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013742 dest_data = PyUnicode_DATA(result);
13743
13744 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013745 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13746 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013747 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013748 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013749 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013750 } else {
13751 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13752 return NULL;
13753 }
13754}
13755
13756static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013757 (lenfunc)unicode_length, /* mp_length */
13758 (binaryfunc)unicode_subscript, /* mp_subscript */
13759 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013760};
13761
Guido van Rossumd57fd912000-03-10 22:53:23 +000013762
Guido van Rossumd57fd912000-03-10 22:53:23 +000013763/* Helpers for PyUnicode_Format() */
13764
Victor Stinnera47082312012-10-04 02:19:54 +020013765struct unicode_formatter_t {
13766 PyObject *args;
13767 int args_owned;
13768 Py_ssize_t arglen, argidx;
13769 PyObject *dict;
13770
13771 enum PyUnicode_Kind fmtkind;
13772 Py_ssize_t fmtcnt, fmtpos;
13773 void *fmtdata;
13774 PyObject *fmtstr;
13775
13776 _PyUnicodeWriter writer;
13777};
13778
13779struct unicode_format_arg_t {
13780 Py_UCS4 ch;
13781 int flags;
13782 Py_ssize_t width;
13783 int prec;
13784 int sign;
13785};
13786
Guido van Rossumd57fd912000-03-10 22:53:23 +000013787static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013788unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013789{
Victor Stinnera47082312012-10-04 02:19:54 +020013790 Py_ssize_t argidx = ctx->argidx;
13791
13792 if (argidx < ctx->arglen) {
13793 ctx->argidx++;
13794 if (ctx->arglen < 0)
13795 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013796 else
Victor Stinnera47082312012-10-04 02:19:54 +020013797 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013798 }
13799 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013800 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013801 return NULL;
13802}
13803
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013804/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013805
Victor Stinnera47082312012-10-04 02:19:54 +020013806/* Format a float into the writer if the writer is not NULL, or into *p_output
13807 otherwise.
13808
13809 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013810static int
Victor Stinnera47082312012-10-04 02:19:54 +020013811formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13812 PyObject **p_output,
13813 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013814{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013815 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013816 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013817 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013818 int prec;
13819 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013820
Guido van Rossumd57fd912000-03-10 22:53:23 +000013821 x = PyFloat_AsDouble(v);
13822 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013823 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013824
Victor Stinnera47082312012-10-04 02:19:54 +020013825 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013826 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013827 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013828
Victor Stinnera47082312012-10-04 02:19:54 +020013829 if (arg->flags & F_ALT)
13830 dtoa_flags = Py_DTSF_ALT;
13831 else
13832 dtoa_flags = 0;
13833 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013834 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013835 return -1;
13836 len = strlen(p);
13837 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013838 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013839 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013840 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013841 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013842 }
13843 else
13844 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013845 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013846 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013847}
13848
Victor Stinnerd0880d52012-04-27 23:40:13 +020013849/* formatlong() emulates the format codes d, u, o, x and X, and
13850 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13851 * Python's regular ints.
13852 * Return value: a new PyUnicodeObject*, or NULL if error.
13853 * The output string is of the form
13854 * "-"? ("0x" | "0X")? digit+
13855 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13856 * set in flags. The case of hex digits will be correct,
13857 * There will be at least prec digits, zero-filled on the left if
13858 * necessary to get that many.
13859 * val object to be converted
13860 * flags bitmask of format flags; only F_ALT is looked at
13861 * prec minimum number of digits; 0-fill on left if needed
13862 * type a character in [duoxX]; u acts the same as d
13863 *
13864 * CAUTION: o, x and X conversions on regular ints can never
13865 * produce a '-' sign, but can for Python's unbounded ints.
13866 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013867PyObject *
13868_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013869{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013870 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013871 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013872 Py_ssize_t i;
13873 int sign; /* 1 if '-', else 0 */
13874 int len; /* number of characters */
13875 Py_ssize_t llen;
13876 int numdigits; /* len == numnondigits + numdigits */
13877 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013878
Victor Stinnerd0880d52012-04-27 23:40:13 +020013879 /* Avoid exceeding SSIZE_T_MAX */
13880 if (prec > INT_MAX-3) {
13881 PyErr_SetString(PyExc_OverflowError,
13882 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013883 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013884 }
13885
13886 assert(PyLong_Check(val));
13887
13888 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013889 default:
13890 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013891 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013892 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013893 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013894 /* int and int subclasses should print numerically when a numeric */
13895 /* format code is used (see issue18780) */
13896 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013897 break;
13898 case 'o':
13899 numnondigits = 2;
13900 result = PyNumber_ToBase(val, 8);
13901 break;
13902 case 'x':
13903 case 'X':
13904 numnondigits = 2;
13905 result = PyNumber_ToBase(val, 16);
13906 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013907 }
13908 if (!result)
13909 return NULL;
13910
13911 assert(unicode_modifiable(result));
13912 assert(PyUnicode_IS_READY(result));
13913 assert(PyUnicode_IS_ASCII(result));
13914
13915 /* To modify the string in-place, there can only be one reference. */
13916 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013917 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013918 PyErr_BadInternalCall();
13919 return NULL;
13920 }
13921 buf = PyUnicode_DATA(result);
13922 llen = PyUnicode_GET_LENGTH(result);
13923 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013924 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013925 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080013926 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013927 return NULL;
13928 }
13929 len = (int)llen;
13930 sign = buf[0] == '-';
13931 numnondigits += sign;
13932 numdigits = len - numnondigits;
13933 assert(numdigits > 0);
13934
13935 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013936 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013937 (type == 'o' || type == 'x' || type == 'X'))) {
13938 assert(buf[sign] == '0');
13939 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13940 buf[sign+1] == 'o');
13941 numnondigits -= 2;
13942 buf += 2;
13943 len -= 2;
13944 if (sign)
13945 buf[0] = '-';
13946 assert(len == numnondigits + numdigits);
13947 assert(numdigits > 0);
13948 }
13949
13950 /* Fill with leading zeroes to meet minimum width. */
13951 if (prec > numdigits) {
13952 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13953 numnondigits + prec);
13954 char *b1;
13955 if (!r1) {
13956 Py_DECREF(result);
13957 return NULL;
13958 }
13959 b1 = PyBytes_AS_STRING(r1);
13960 for (i = 0; i < numnondigits; ++i)
13961 *b1++ = *buf++;
13962 for (i = 0; i < prec - numdigits; i++)
13963 *b1++ = '0';
13964 for (i = 0; i < numdigits; i++)
13965 *b1++ = *buf++;
13966 *b1 = '\0';
13967 Py_DECREF(result);
13968 result = r1;
13969 buf = PyBytes_AS_STRING(result);
13970 len = numnondigits + prec;
13971 }
13972
13973 /* Fix up case for hex conversions. */
13974 if (type == 'X') {
13975 /* Need to convert all lower case letters to upper case.
13976 and need to convert 0x to 0X (and -0x to -0X). */
13977 for (i = 0; i < len; i++)
13978 if (buf[i] >= 'a' && buf[i] <= 'x')
13979 buf[i] -= 'a'-'A';
13980 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013981 if (!PyUnicode_Check(result)
13982 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013983 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013984 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013985 Py_DECREF(result);
13986 result = unicode;
13987 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013988 else if (len != PyUnicode_GET_LENGTH(result)) {
13989 if (PyUnicode_Resize(&result, len) < 0)
13990 Py_CLEAR(result);
13991 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013992 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013993}
13994
Ethan Furmandf3ed242014-01-05 06:50:30 -080013995/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020013996 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020013997 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020013998 * -1 and raise an exception on error */
13999static int
Victor Stinnera47082312012-10-04 02:19:54 +020014000mainformatlong(PyObject *v,
14001 struct unicode_format_arg_t *arg,
14002 PyObject **p_output,
14003 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014004{
14005 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014006 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014007
14008 if (!PyNumber_Check(v))
14009 goto wrongtype;
14010
Ethan Furman9ab74802014-03-21 06:38:46 -070014011 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014012 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014013 if (type == 'o' || type == 'x' || type == 'X') {
14014 iobj = PyNumber_Index(v);
14015 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014016 if (PyErr_ExceptionMatches(PyExc_TypeError))
14017 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014018 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014019 }
14020 }
14021 else {
14022 iobj = PyNumber_Long(v);
14023 if (iobj == NULL ) {
14024 if (PyErr_ExceptionMatches(PyExc_TypeError))
14025 goto wrongtype;
14026 return -1;
14027 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014028 }
14029 assert(PyLong_Check(iobj));
14030 }
14031 else {
14032 iobj = v;
14033 Py_INCREF(iobj);
14034 }
14035
14036 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014037 && arg->width == -1 && arg->prec == -1
14038 && !(arg->flags & (F_SIGN | F_BLANK))
14039 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014040 {
14041 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014042 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014043 int base;
14044
Victor Stinnera47082312012-10-04 02:19:54 +020014045 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014046 {
14047 default:
14048 assert(0 && "'type' not in [diuoxX]");
14049 case 'd':
14050 case 'i':
14051 case 'u':
14052 base = 10;
14053 break;
14054 case 'o':
14055 base = 8;
14056 break;
14057 case 'x':
14058 case 'X':
14059 base = 16;
14060 break;
14061 }
14062
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014063 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14064 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014065 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014066 }
14067 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014068 return 1;
14069 }
14070
Ethan Furmanb95b5612015-01-23 20:05:18 -080014071 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014072 Py_DECREF(iobj);
14073 if (res == NULL)
14074 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014075 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014076 return 0;
14077
14078wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014079 switch(type)
14080 {
14081 case 'o':
14082 case 'x':
14083 case 'X':
14084 PyErr_Format(PyExc_TypeError,
14085 "%%%c format: an integer is required, "
14086 "not %.200s",
14087 type, Py_TYPE(v)->tp_name);
14088 break;
14089 default:
14090 PyErr_Format(PyExc_TypeError,
14091 "%%%c format: a number is required, "
14092 "not %.200s",
14093 type, Py_TYPE(v)->tp_name);
14094 break;
14095 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014096 return -1;
14097}
14098
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014099static Py_UCS4
14100formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014101{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014102 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014103 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014104 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014105 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014106 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014107 goto onError;
14108 }
14109 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014110 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014111 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014112 /* make sure number is a type of integer */
14113 if (!PyLong_Check(v)) {
14114 iobj = PyNumber_Index(v);
14115 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014116 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014117 }
14118 v = iobj;
14119 Py_DECREF(iobj);
14120 }
14121 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014122 x = PyLong_AsLong(v);
14123 if (x == -1 && PyErr_Occurred())
14124 goto onError;
14125
Victor Stinner8faf8212011-12-08 22:14:11 +010014126 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014127 PyErr_SetString(PyExc_OverflowError,
14128 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014129 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014130 }
14131
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014132 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014133 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014134
Benjamin Peterson29060642009-01-31 22:14:21 +000014135 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014136 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014137 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014138 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014139}
14140
Victor Stinnera47082312012-10-04 02:19:54 +020014141/* Parse options of an argument: flags, width, precision.
14142 Handle also "%(name)" syntax.
14143
14144 Return 0 if the argument has been formatted into arg->str.
14145 Return 1 if the argument has been written into ctx->writer,
14146 Raise an exception and return -1 on error. */
14147static int
14148unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14149 struct unicode_format_arg_t *arg)
14150{
14151#define FORMAT_READ(ctx) \
14152 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14153
14154 PyObject *v;
14155
Victor Stinnera47082312012-10-04 02:19:54 +020014156 if (arg->ch == '(') {
14157 /* Get argument value from a dictionary. Example: "%(name)s". */
14158 Py_ssize_t keystart;
14159 Py_ssize_t keylen;
14160 PyObject *key;
14161 int pcount = 1;
14162
14163 if (ctx->dict == NULL) {
14164 PyErr_SetString(PyExc_TypeError,
14165 "format requires a mapping");
14166 return -1;
14167 }
14168 ++ctx->fmtpos;
14169 --ctx->fmtcnt;
14170 keystart = ctx->fmtpos;
14171 /* Skip over balanced parentheses */
14172 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14173 arg->ch = FORMAT_READ(ctx);
14174 if (arg->ch == ')')
14175 --pcount;
14176 else if (arg->ch == '(')
14177 ++pcount;
14178 ctx->fmtpos++;
14179 }
14180 keylen = ctx->fmtpos - keystart - 1;
14181 if (ctx->fmtcnt < 0 || pcount > 0) {
14182 PyErr_SetString(PyExc_ValueError,
14183 "incomplete format key");
14184 return -1;
14185 }
14186 key = PyUnicode_Substring(ctx->fmtstr,
14187 keystart, keystart + keylen);
14188 if (key == NULL)
14189 return -1;
14190 if (ctx->args_owned) {
14191 Py_DECREF(ctx->args);
14192 ctx->args_owned = 0;
14193 }
14194 ctx->args = PyObject_GetItem(ctx->dict, key);
14195 Py_DECREF(key);
14196 if (ctx->args == NULL)
14197 return -1;
14198 ctx->args_owned = 1;
14199 ctx->arglen = -1;
14200 ctx->argidx = -2;
14201 }
14202
14203 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014204 while (--ctx->fmtcnt >= 0) {
14205 arg->ch = FORMAT_READ(ctx);
14206 ctx->fmtpos++;
14207 switch (arg->ch) {
14208 case '-': arg->flags |= F_LJUST; continue;
14209 case '+': arg->flags |= F_SIGN; continue;
14210 case ' ': arg->flags |= F_BLANK; continue;
14211 case '#': arg->flags |= F_ALT; continue;
14212 case '0': arg->flags |= F_ZERO; continue;
14213 }
14214 break;
14215 }
14216
14217 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014218 if (arg->ch == '*') {
14219 v = unicode_format_getnextarg(ctx);
14220 if (v == NULL)
14221 return -1;
14222 if (!PyLong_Check(v)) {
14223 PyErr_SetString(PyExc_TypeError,
14224 "* wants int");
14225 return -1;
14226 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014227 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014228 if (arg->width == -1 && PyErr_Occurred())
14229 return -1;
14230 if (arg->width < 0) {
14231 arg->flags |= F_LJUST;
14232 arg->width = -arg->width;
14233 }
14234 if (--ctx->fmtcnt >= 0) {
14235 arg->ch = FORMAT_READ(ctx);
14236 ctx->fmtpos++;
14237 }
14238 }
14239 else if (arg->ch >= '0' && arg->ch <= '9') {
14240 arg->width = arg->ch - '0';
14241 while (--ctx->fmtcnt >= 0) {
14242 arg->ch = FORMAT_READ(ctx);
14243 ctx->fmtpos++;
14244 if (arg->ch < '0' || arg->ch > '9')
14245 break;
14246 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14247 mixing signed and unsigned comparison. Since arg->ch is between
14248 '0' and '9', casting to int is safe. */
14249 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14250 PyErr_SetString(PyExc_ValueError,
14251 "width too big");
14252 return -1;
14253 }
14254 arg->width = arg->width*10 + (arg->ch - '0');
14255 }
14256 }
14257
14258 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014259 if (arg->ch == '.') {
14260 arg->prec = 0;
14261 if (--ctx->fmtcnt >= 0) {
14262 arg->ch = FORMAT_READ(ctx);
14263 ctx->fmtpos++;
14264 }
14265 if (arg->ch == '*') {
14266 v = unicode_format_getnextarg(ctx);
14267 if (v == NULL)
14268 return -1;
14269 if (!PyLong_Check(v)) {
14270 PyErr_SetString(PyExc_TypeError,
14271 "* wants int");
14272 return -1;
14273 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014274 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014275 if (arg->prec == -1 && PyErr_Occurred())
14276 return -1;
14277 if (arg->prec < 0)
14278 arg->prec = 0;
14279 if (--ctx->fmtcnt >= 0) {
14280 arg->ch = FORMAT_READ(ctx);
14281 ctx->fmtpos++;
14282 }
14283 }
14284 else if (arg->ch >= '0' && arg->ch <= '9') {
14285 arg->prec = arg->ch - '0';
14286 while (--ctx->fmtcnt >= 0) {
14287 arg->ch = FORMAT_READ(ctx);
14288 ctx->fmtpos++;
14289 if (arg->ch < '0' || arg->ch > '9')
14290 break;
14291 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14292 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014293 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014294 return -1;
14295 }
14296 arg->prec = arg->prec*10 + (arg->ch - '0');
14297 }
14298 }
14299 }
14300
14301 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14302 if (ctx->fmtcnt >= 0) {
14303 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14304 if (--ctx->fmtcnt >= 0) {
14305 arg->ch = FORMAT_READ(ctx);
14306 ctx->fmtpos++;
14307 }
14308 }
14309 }
14310 if (ctx->fmtcnt < 0) {
14311 PyErr_SetString(PyExc_ValueError,
14312 "incomplete format");
14313 return -1;
14314 }
14315 return 0;
14316
14317#undef FORMAT_READ
14318}
14319
14320/* Format one argument. Supported conversion specifiers:
14321
14322 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014323 - "i", "d", "u": int or float
14324 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014325 - "e", "E", "f", "F", "g", "G": float
14326 - "c": int or str (1 character)
14327
Victor Stinner8dbd4212012-12-04 09:30:24 +010014328 When possible, the output is written directly into the Unicode writer
14329 (ctx->writer). A string is created when padding is required.
14330
Victor Stinnera47082312012-10-04 02:19:54 +020014331 Return 0 if the argument has been formatted into *p_str,
14332 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014333 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014334static int
14335unicode_format_arg_format(struct unicode_formatter_t *ctx,
14336 struct unicode_format_arg_t *arg,
14337 PyObject **p_str)
14338{
14339 PyObject *v;
14340 _PyUnicodeWriter *writer = &ctx->writer;
14341
14342 if (ctx->fmtcnt == 0)
14343 ctx->writer.overallocate = 0;
14344
14345 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014346 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014347 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014348 return 1;
14349 }
14350
14351 v = unicode_format_getnextarg(ctx);
14352 if (v == NULL)
14353 return -1;
14354
Victor Stinnera47082312012-10-04 02:19:54 +020014355
14356 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014357 case 's':
14358 case 'r':
14359 case 'a':
14360 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14361 /* Fast path */
14362 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14363 return -1;
14364 return 1;
14365 }
14366
14367 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14368 *p_str = v;
14369 Py_INCREF(*p_str);
14370 }
14371 else {
14372 if (arg->ch == 's')
14373 *p_str = PyObject_Str(v);
14374 else if (arg->ch == 'r')
14375 *p_str = PyObject_Repr(v);
14376 else
14377 *p_str = PyObject_ASCII(v);
14378 }
14379 break;
14380
14381 case 'i':
14382 case 'd':
14383 case 'u':
14384 case 'o':
14385 case 'x':
14386 case 'X':
14387 {
14388 int ret = mainformatlong(v, arg, p_str, writer);
14389 if (ret != 0)
14390 return ret;
14391 arg->sign = 1;
14392 break;
14393 }
14394
14395 case 'e':
14396 case 'E':
14397 case 'f':
14398 case 'F':
14399 case 'g':
14400 case 'G':
14401 if (arg->width == -1 && arg->prec == -1
14402 && !(arg->flags & (F_SIGN | F_BLANK)))
14403 {
14404 /* Fast path */
14405 if (formatfloat(v, arg, NULL, writer) == -1)
14406 return -1;
14407 return 1;
14408 }
14409
14410 arg->sign = 1;
14411 if (formatfloat(v, arg, p_str, NULL) == -1)
14412 return -1;
14413 break;
14414
14415 case 'c':
14416 {
14417 Py_UCS4 ch = formatchar(v);
14418 if (ch == (Py_UCS4) -1)
14419 return -1;
14420 if (arg->width == -1 && arg->prec == -1) {
14421 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014422 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014423 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014424 return 1;
14425 }
14426 *p_str = PyUnicode_FromOrdinal(ch);
14427 break;
14428 }
14429
14430 default:
14431 PyErr_Format(PyExc_ValueError,
14432 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014433 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014434 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14435 (int)arg->ch,
14436 ctx->fmtpos - 1);
14437 return -1;
14438 }
14439 if (*p_str == NULL)
14440 return -1;
14441 assert (PyUnicode_Check(*p_str));
14442 return 0;
14443}
14444
14445static int
14446unicode_format_arg_output(struct unicode_formatter_t *ctx,
14447 struct unicode_format_arg_t *arg,
14448 PyObject *str)
14449{
14450 Py_ssize_t len;
14451 enum PyUnicode_Kind kind;
14452 void *pbuf;
14453 Py_ssize_t pindex;
14454 Py_UCS4 signchar;
14455 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014456 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014457 Py_ssize_t sublen;
14458 _PyUnicodeWriter *writer = &ctx->writer;
14459 Py_UCS4 fill;
14460
14461 fill = ' ';
14462 if (arg->sign && arg->flags & F_ZERO)
14463 fill = '0';
14464
14465 if (PyUnicode_READY(str) == -1)
14466 return -1;
14467
14468 len = PyUnicode_GET_LENGTH(str);
14469 if ((arg->width == -1 || arg->width <= len)
14470 && (arg->prec == -1 || arg->prec >= len)
14471 && !(arg->flags & (F_SIGN | F_BLANK)))
14472 {
14473 /* Fast path */
14474 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14475 return -1;
14476 return 0;
14477 }
14478
14479 /* Truncate the string for "s", "r" and "a" formats
14480 if the precision is set */
14481 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14482 if (arg->prec >= 0 && len > arg->prec)
14483 len = arg->prec;
14484 }
14485
14486 /* Adjust sign and width */
14487 kind = PyUnicode_KIND(str);
14488 pbuf = PyUnicode_DATA(str);
14489 pindex = 0;
14490 signchar = '\0';
14491 if (arg->sign) {
14492 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14493 if (ch == '-' || ch == '+') {
14494 signchar = ch;
14495 len--;
14496 pindex++;
14497 }
14498 else if (arg->flags & F_SIGN)
14499 signchar = '+';
14500 else if (arg->flags & F_BLANK)
14501 signchar = ' ';
14502 else
14503 arg->sign = 0;
14504 }
14505 if (arg->width < len)
14506 arg->width = len;
14507
14508 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014509 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014510 if (!(arg->flags & F_LJUST)) {
14511 if (arg->sign) {
14512 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014513 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014514 }
14515 else {
14516 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014517 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014518 }
14519 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014520 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14521 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014522 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014523 }
14524
Victor Stinnera47082312012-10-04 02:19:54 +020014525 buflen = arg->width;
14526 if (arg->sign && len == arg->width)
14527 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014528 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014529 return -1;
14530
14531 /* Write the sign if needed */
14532 if (arg->sign) {
14533 if (fill != ' ') {
14534 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14535 writer->pos += 1;
14536 }
14537 if (arg->width > len)
14538 arg->width--;
14539 }
14540
14541 /* Write the numeric prefix for "x", "X" and "o" formats
14542 if the alternate form is used.
14543 For example, write "0x" for the "%#x" format. */
14544 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14545 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14546 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14547 if (fill != ' ') {
14548 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14549 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14550 writer->pos += 2;
14551 pindex += 2;
14552 }
14553 arg->width -= 2;
14554 if (arg->width < 0)
14555 arg->width = 0;
14556 len -= 2;
14557 }
14558
14559 /* Pad left with the fill character if needed */
14560 if (arg->width > len && !(arg->flags & F_LJUST)) {
14561 sublen = arg->width - len;
14562 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14563 writer->pos += sublen;
14564 arg->width = len;
14565 }
14566
14567 /* If padding with spaces: write sign if needed and/or numeric prefix if
14568 the alternate form is used */
14569 if (fill == ' ') {
14570 if (arg->sign) {
14571 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14572 writer->pos += 1;
14573 }
14574 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14575 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14576 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14577 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14578 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14579 writer->pos += 2;
14580 pindex += 2;
14581 }
14582 }
14583
14584 /* Write characters */
14585 if (len) {
14586 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14587 str, pindex, len);
14588 writer->pos += len;
14589 }
14590
14591 /* Pad right with the fill character if needed */
14592 if (arg->width > len) {
14593 sublen = arg->width - len;
14594 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14595 writer->pos += sublen;
14596 }
14597 return 0;
14598}
14599
14600/* Helper of PyUnicode_Format(): format one arg.
14601 Return 0 on success, raise an exception and return -1 on error. */
14602static int
14603unicode_format_arg(struct unicode_formatter_t *ctx)
14604{
14605 struct unicode_format_arg_t arg;
14606 PyObject *str;
14607 int ret;
14608
Victor Stinner8dbd4212012-12-04 09:30:24 +010014609 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14610 arg.flags = 0;
14611 arg.width = -1;
14612 arg.prec = -1;
14613 arg.sign = 0;
14614 str = NULL;
14615
Victor Stinnera47082312012-10-04 02:19:54 +020014616 ret = unicode_format_arg_parse(ctx, &arg);
14617 if (ret == -1)
14618 return -1;
14619
14620 ret = unicode_format_arg_format(ctx, &arg, &str);
14621 if (ret == -1)
14622 return -1;
14623
14624 if (ret != 1) {
14625 ret = unicode_format_arg_output(ctx, &arg, str);
14626 Py_DECREF(str);
14627 if (ret == -1)
14628 return -1;
14629 }
14630
14631 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14632 PyErr_SetString(PyExc_TypeError,
14633 "not all arguments converted during string formatting");
14634 return -1;
14635 }
14636 return 0;
14637}
14638
Alexander Belopolsky40018472011-02-26 01:02:56 +000014639PyObject *
14640PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014641{
Victor Stinnera47082312012-10-04 02:19:54 +020014642 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014643
Guido van Rossumd57fd912000-03-10 22:53:23 +000014644 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014645 PyErr_BadInternalCall();
14646 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014647 }
Victor Stinnera47082312012-10-04 02:19:54 +020014648
14649 ctx.fmtstr = PyUnicode_FromObject(format);
14650 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014651 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014652 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14653 Py_DECREF(ctx.fmtstr);
14654 return NULL;
14655 }
14656 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14657 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14658 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14659 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014660
Victor Stinner8f674cc2013-04-17 23:02:17 +020014661 _PyUnicodeWriter_Init(&ctx.writer);
14662 ctx.writer.min_length = ctx.fmtcnt + 100;
14663 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014664
Guido van Rossumd57fd912000-03-10 22:53:23 +000014665 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014666 ctx.arglen = PyTuple_Size(args);
14667 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014668 }
14669 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014670 ctx.arglen = -1;
14671 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014672 }
Victor Stinnera47082312012-10-04 02:19:54 +020014673 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014674 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014675 ctx.dict = args;
14676 else
14677 ctx.dict = NULL;
14678 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014679
Victor Stinnera47082312012-10-04 02:19:54 +020014680 while (--ctx.fmtcnt >= 0) {
14681 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014682 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014683
14684 nonfmtpos = ctx.fmtpos++;
14685 while (ctx.fmtcnt >= 0 &&
14686 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14687 ctx.fmtpos++;
14688 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014689 }
Victor Stinnera47082312012-10-04 02:19:54 +020014690 if (ctx.fmtcnt < 0) {
14691 ctx.fmtpos--;
14692 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014693 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014694
Victor Stinnercfc4c132013-04-03 01:48:39 +020014695 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14696 nonfmtpos, ctx.fmtpos) < 0)
14697 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014698 }
14699 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014700 ctx.fmtpos++;
14701 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014702 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014703 }
14704 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014705
Victor Stinnera47082312012-10-04 02:19:54 +020014706 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014707 PyErr_SetString(PyExc_TypeError,
14708 "not all arguments converted during string formatting");
14709 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014710 }
14711
Victor Stinnera47082312012-10-04 02:19:54 +020014712 if (ctx.args_owned) {
14713 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014714 }
Victor Stinnera47082312012-10-04 02:19:54 +020014715 Py_DECREF(ctx.fmtstr);
14716 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014717
Benjamin Peterson29060642009-01-31 22:14:21 +000014718 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014719 Py_DECREF(ctx.fmtstr);
14720 _PyUnicodeWriter_Dealloc(&ctx.writer);
14721 if (ctx.args_owned) {
14722 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014723 }
14724 return NULL;
14725}
14726
Jeremy Hylton938ace62002-07-17 16:30:39 +000014727static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014728unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14729
Tim Peters6d6c1a32001-08-02 04:15:00 +000014730static PyObject *
14731unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14732{
Benjamin Peterson29060642009-01-31 22:14:21 +000014733 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014734 static char *kwlist[] = {"object", "encoding", "errors", 0};
14735 char *encoding = NULL;
14736 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014737
Benjamin Peterson14339b62009-01-31 16:36:08 +000014738 if (type != &PyUnicode_Type)
14739 return unicode_subtype_new(type, args, kwds);
14740 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014741 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014742 return NULL;
14743 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014744 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014745 if (encoding == NULL && errors == NULL)
14746 return PyObject_Str(x);
14747 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014748 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014749}
14750
Guido van Rossume023fe02001-08-30 03:12:59 +000014751static PyObject *
14752unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14753{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014754 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014755 Py_ssize_t length, char_size;
14756 int share_wstr, share_utf8;
14757 unsigned int kind;
14758 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014759
Benjamin Peterson14339b62009-01-31 16:36:08 +000014760 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014761
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014762 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014763 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014764 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014765 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014766 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014767 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014768 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014769 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014770
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014771 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014772 if (self == NULL) {
14773 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014774 return NULL;
14775 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014776 kind = PyUnicode_KIND(unicode);
14777 length = PyUnicode_GET_LENGTH(unicode);
14778
14779 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014780#ifdef Py_DEBUG
14781 _PyUnicode_HASH(self) = -1;
14782#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014783 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014784#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014785 _PyUnicode_STATE(self).interned = 0;
14786 _PyUnicode_STATE(self).kind = kind;
14787 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014788 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014789 _PyUnicode_STATE(self).ready = 1;
14790 _PyUnicode_WSTR(self) = NULL;
14791 _PyUnicode_UTF8_LENGTH(self) = 0;
14792 _PyUnicode_UTF8(self) = NULL;
14793 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014794 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014795
14796 share_utf8 = 0;
14797 share_wstr = 0;
14798 if (kind == PyUnicode_1BYTE_KIND) {
14799 char_size = 1;
14800 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14801 share_utf8 = 1;
14802 }
14803 else if (kind == PyUnicode_2BYTE_KIND) {
14804 char_size = 2;
14805 if (sizeof(wchar_t) == 2)
14806 share_wstr = 1;
14807 }
14808 else {
14809 assert(kind == PyUnicode_4BYTE_KIND);
14810 char_size = 4;
14811 if (sizeof(wchar_t) == 4)
14812 share_wstr = 1;
14813 }
14814
14815 /* Ensure we won't overflow the length. */
14816 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14817 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014818 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014819 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014820 data = PyObject_MALLOC((length + 1) * char_size);
14821 if (data == NULL) {
14822 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014823 goto onError;
14824 }
14825
Victor Stinnerc3c74152011-10-02 20:39:55 +020014826 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014827 if (share_utf8) {
14828 _PyUnicode_UTF8_LENGTH(self) = length;
14829 _PyUnicode_UTF8(self) = data;
14830 }
14831 if (share_wstr) {
14832 _PyUnicode_WSTR_LENGTH(self) = length;
14833 _PyUnicode_WSTR(self) = (wchar_t *)data;
14834 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014835
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014836 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014837 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014838 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014839#ifdef Py_DEBUG
14840 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14841#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014842 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014843 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014844
14845onError:
14846 Py_DECREF(unicode);
14847 Py_DECREF(self);
14848 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014849}
14850
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014851PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014852"str(object='') -> str\n\
14853str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014854\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014855Create a new string object from the given object. If encoding or\n\
14856errors is specified, then the object must expose a data buffer\n\
14857that will be decoded using the given encoding and error handler.\n\
14858Otherwise, returns the result of object.__str__() (if defined)\n\
14859or repr(object).\n\
14860encoding defaults to sys.getdefaultencoding().\n\
14861errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014862
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014863static PyObject *unicode_iter(PyObject *seq);
14864
Guido van Rossumd57fd912000-03-10 22:53:23 +000014865PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014866 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014867 "str", /* tp_name */
14868 sizeof(PyUnicodeObject), /* tp_size */
14869 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014870 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014871 (destructor)unicode_dealloc, /* tp_dealloc */
14872 0, /* tp_print */
14873 0, /* tp_getattr */
14874 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014875 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014876 unicode_repr, /* tp_repr */
14877 &unicode_as_number, /* tp_as_number */
14878 &unicode_as_sequence, /* tp_as_sequence */
14879 &unicode_as_mapping, /* tp_as_mapping */
14880 (hashfunc) unicode_hash, /* tp_hash*/
14881 0, /* tp_call*/
14882 (reprfunc) unicode_str, /* tp_str */
14883 PyObject_GenericGetAttr, /* tp_getattro */
14884 0, /* tp_setattro */
14885 0, /* tp_as_buffer */
14886 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014887 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014888 unicode_doc, /* tp_doc */
14889 0, /* tp_traverse */
14890 0, /* tp_clear */
14891 PyUnicode_RichCompare, /* tp_richcompare */
14892 0, /* tp_weaklistoffset */
14893 unicode_iter, /* tp_iter */
14894 0, /* tp_iternext */
14895 unicode_methods, /* tp_methods */
14896 0, /* tp_members */
14897 0, /* tp_getset */
14898 &PyBaseObject_Type, /* tp_base */
14899 0, /* tp_dict */
14900 0, /* tp_descr_get */
14901 0, /* tp_descr_set */
14902 0, /* tp_dictoffset */
14903 0, /* tp_init */
14904 0, /* tp_alloc */
14905 unicode_new, /* tp_new */
14906 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014907};
14908
14909/* Initialize the Unicode implementation */
14910
Victor Stinner3a50e702011-10-18 21:21:00 +020014911int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014912{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014913 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014914 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014915 0x000A, /* LINE FEED */
14916 0x000D, /* CARRIAGE RETURN */
14917 0x001C, /* FILE SEPARATOR */
14918 0x001D, /* GROUP SEPARATOR */
14919 0x001E, /* RECORD SEPARATOR */
14920 0x0085, /* NEXT LINE */
14921 0x2028, /* LINE SEPARATOR */
14922 0x2029, /* PARAGRAPH SEPARATOR */
14923 };
14924
Fred Drakee4315f52000-05-09 19:53:39 +000014925 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014926 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014927 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014928 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014929 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014930
Guido van Rossumcacfc072002-05-24 19:01:59 +000014931 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014932 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014933
14934 /* initialize the linebreak bloom filter */
14935 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014936 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014937 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014938
Christian Heimes26532f72013-07-20 14:57:16 +020014939 if (PyType_Ready(&EncodingMapType) < 0)
14940 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014941
Benjamin Petersonc4311282012-10-30 23:21:10 -040014942 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14943 Py_FatalError("Can't initialize field name iterator type");
14944
14945 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14946 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014947
Victor Stinner3a50e702011-10-18 21:21:00 +020014948 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014949}
14950
14951/* Finalize the Unicode implementation */
14952
/* Nothing is released here: the function only reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
14958
Guido van Rossumd57fd912000-03-10 22:53:23 +000014959void
Thomas Wouters78890102000-07-22 19:25:51 +000014960_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014961{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014962 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014963
Serhiy Storchaka05997252013-01-26 12:14:02 +020014964 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014965
Serhiy Storchaka05997252013-01-26 12:14:02 +020014966 for (i = 0; i < 256; i++)
14967 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014968 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014969 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014970}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014971
Walter Dörwald16807132007-05-25 13:52:07 +000014972void
14973PyUnicode_InternInPlace(PyObject **p)
14974{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014975 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014976 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014977#ifdef Py_DEBUG
14978 assert(s != NULL);
14979 assert(_PyUnicode_CHECK(s));
14980#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014981 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014982 return;
14983#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014984 /* If it's a subclass, we don't really know what putting
14985 it in the interned dict might do. */
14986 if (!PyUnicode_CheckExact(s))
14987 return;
14988 if (PyUnicode_CHECK_INTERNED(s))
14989 return;
14990 if (interned == NULL) {
14991 interned = PyDict_New();
14992 if (interned == NULL) {
14993 PyErr_Clear(); /* Don't leave an exception */
14994 return;
14995 }
14996 }
14997 /* It might be that the GetItem call fails even
14998 though the key is present in the dictionary,
14999 namely when this happens during a stack overflow. */
15000 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015001 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015002 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015003
Victor Stinnerf0335102013-04-14 19:13:03 +020015004 if (t) {
15005 Py_INCREF(t);
15006 Py_DECREF(*p);
15007 *p = t;
15008 return;
15009 }
Walter Dörwald16807132007-05-25 13:52:07 +000015010
Benjamin Peterson14339b62009-01-31 16:36:08 +000015011 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015012 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015013 PyErr_Clear();
15014 PyThreadState_GET()->recursion_critical = 0;
15015 return;
15016 }
15017 PyThreadState_GET()->recursion_critical = 0;
15018 /* The two references in interned are not counted by refcnt.
15019 The deallocator will take care of this */
15020 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015021 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015022}
15023
15024void
15025PyUnicode_InternImmortal(PyObject **p)
15026{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015027 PyUnicode_InternInPlace(p);
15028 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015029 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015030 Py_INCREF(*p);
15031 }
Walter Dörwald16807132007-05-25 13:52:07 +000015032}
15033
15034PyObject *
15035PyUnicode_InternFromString(const char *cp)
15036{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015037 PyObject *s = PyUnicode_FromString(cp);
15038 if (s == NULL)
15039 return NULL;
15040 PyUnicode_InternInPlace(&s);
15041 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015042}
15043
Alexander Belopolsky40018472011-02-26 01:02:56 +000015044void
15045_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015046{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015047 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015048 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015049 Py_ssize_t i, n;
15050 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015051
Benjamin Peterson14339b62009-01-31 16:36:08 +000015052 if (interned == NULL || !PyDict_Check(interned))
15053 return;
15054 keys = PyDict_Keys(interned);
15055 if (keys == NULL || !PyList_Check(keys)) {
15056 PyErr_Clear();
15057 return;
15058 }
Walter Dörwald16807132007-05-25 13:52:07 +000015059
Benjamin Peterson14339b62009-01-31 16:36:08 +000015060 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15061 detector, interned unicode strings are not forcibly deallocated;
15062 rather, we give them their stolen references back, and then clear
15063 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015064
Benjamin Peterson14339b62009-01-31 16:36:08 +000015065 n = PyList_GET_SIZE(keys);
15066 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015067 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015068 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015069 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015070 if (PyUnicode_READY(s) == -1) {
15071 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015072 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015073 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015074 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015075 case SSTATE_NOT_INTERNED:
15076 /* XXX Shouldn't happen */
15077 break;
15078 case SSTATE_INTERNED_IMMORTAL:
15079 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015080 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015081 break;
15082 case SSTATE_INTERNED_MORTAL:
15083 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015084 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015085 break;
15086 default:
15087 Py_FatalError("Inconsistent interned string state.");
15088 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015089 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015090 }
15091 fprintf(stderr, "total size of all interned strings: "
15092 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15093 "mortal/immortal\n", mortal_size, immortal_size);
15094 Py_DECREF(keys);
15095 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015096 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015097}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015098
15099
15100/********************* Unicode Iterator **************************/
15101
15102typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015103 PyObject_HEAD
15104 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015105 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015106} unicodeiterobject;
15107
15108static void
15109unicodeiter_dealloc(unicodeiterobject *it)
15110{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015111 _PyObject_GC_UNTRACK(it);
15112 Py_XDECREF(it->it_seq);
15113 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015114}
15115
15116static int
15117unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15118{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015119 Py_VISIT(it->it_seq);
15120 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015121}
15122
15123static PyObject *
15124unicodeiter_next(unicodeiterobject *it)
15125{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015126 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015127
Benjamin Peterson14339b62009-01-31 16:36:08 +000015128 assert(it != NULL);
15129 seq = it->it_seq;
15130 if (seq == NULL)
15131 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015132 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015134 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15135 int kind = PyUnicode_KIND(seq);
15136 void *data = PyUnicode_DATA(seq);
15137 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15138 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015139 if (item != NULL)
15140 ++it->it_index;
15141 return item;
15142 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015143
Benjamin Peterson14339b62009-01-31 16:36:08 +000015144 Py_DECREF(seq);
15145 it->it_seq = NULL;
15146 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015147}
15148
15149static PyObject *
15150unicodeiter_len(unicodeiterobject *it)
15151{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015152 Py_ssize_t len = 0;
15153 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015154 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015155 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015156}
15157
15158PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15159
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015160static PyObject *
15161unicodeiter_reduce(unicodeiterobject *it)
15162{
15163 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015164 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015165 it->it_seq, it->it_index);
15166 } else {
15167 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15168 if (u == NULL)
15169 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015170 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015171 }
15172}
15173
15174PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15175
15176static PyObject *
15177unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15178{
15179 Py_ssize_t index = PyLong_AsSsize_t(state);
15180 if (index == -1 && PyErr_Occurred())
15181 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015182 if (it->it_seq != NULL) {
15183 if (index < 0)
15184 index = 0;
15185 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15186 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15187 it->it_index = index;
15188 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015189 Py_RETURN_NONE;
15190}
15191
15192PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15193
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015194static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015195 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015196 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015197 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15198 reduce_doc},
15199 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15200 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015201 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015202};
15203
15204PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015205 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15206 "str_iterator", /* tp_name */
15207 sizeof(unicodeiterobject), /* tp_basicsize */
15208 0, /* tp_itemsize */
15209 /* methods */
15210 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15211 0, /* tp_print */
15212 0, /* tp_getattr */
15213 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015214 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015215 0, /* tp_repr */
15216 0, /* tp_as_number */
15217 0, /* tp_as_sequence */
15218 0, /* tp_as_mapping */
15219 0, /* tp_hash */
15220 0, /* tp_call */
15221 0, /* tp_str */
15222 PyObject_GenericGetAttr, /* tp_getattro */
15223 0, /* tp_setattro */
15224 0, /* tp_as_buffer */
15225 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15226 0, /* tp_doc */
15227 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15228 0, /* tp_clear */
15229 0, /* tp_richcompare */
15230 0, /* tp_weaklistoffset */
15231 PyObject_SelfIter, /* tp_iter */
15232 (iternextfunc)unicodeiter_next, /* tp_iternext */
15233 unicodeiter_methods, /* tp_methods */
15234 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015235};
15236
15237static PyObject *
15238unicode_iter(PyObject *seq)
15239{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015240 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015241
Benjamin Peterson14339b62009-01-31 16:36:08 +000015242 if (!PyUnicode_Check(seq)) {
15243 PyErr_BadInternalCall();
15244 return NULL;
15245 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015246 if (PyUnicode_READY(seq) == -1)
15247 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015248 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15249 if (it == NULL)
15250 return NULL;
15251 it->it_index = 0;
15252 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015253 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015254 _PyObject_GC_TRACK(it);
15255 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015256}
15257
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015258
/* Return the number of Py_UNICODE elements in the NUL-terminated buffer
   `u`, not counting the terminator.

   The counter has the same type as the return value (size_t) so that very
   long buffers cannot overflow a narrower signed integer. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
15267
/* Copy the NUL-terminated buffer `s2`, terminator included, into `s1` and
   return `s1`.  The caller must provide a large enough destination. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0)
            break;
    }
    return s1;
}
15275
/* Copy at most `n` elements from the NUL-terminated buffer `s2` into `s1`
   and return `s1`.

   Copying stops right after the terminator has been copied, or once `n`
   elements have been written, whichever comes first.  The bound is checked
   *before* each store, so no more than `n` elements are ever written (and
   nothing at all when n == 0).  As with C strncpy, the result is not
   NUL-terminated when `s2` holds `n` or more characters; unlike C strncpy,
   the remainder of `s1` is not zero-padded. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;

    while (n > 0) {
        n--;
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
15285
/* Append the NUL-terminated buffer `s2` to the end of the NUL-terminated
   buffer `s1` and return `s1`.  The caller must make sure `s1` has room
   for both strings plus the terminator. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* The copy starts at s1's current terminator, overwriting it. */
    Py_UNICODE *tail = s1 + Py_UNICODE_strlen(s1);

    Py_UNICODE_strcpy(tail, s2);
    return s1;
}
15294
/* Compare two NUL-terminated Py_UNICODE buffers element by element.

   Returns -1, 0 or +1 when `s1` sorts before, equal to or after `s2`.
   A string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (;; s1++, s2++) {
        const Py_UNICODE c1 = *s1;
        const Py_UNICODE c2 = *s2;

        if (c1 == c2) {
            if (c1 == 0)
                return 0;       /* both ended together */
            continue;
        }
        /* First difference: an ended string sorts before a longer one. */
        if (c1 == 0)
            return -1;
        if (c2 == 0)
            return 1;
        return (c1 < c2) ? -1 : +1;
    }
}
15308
/* Compare at most `n` leading elements of two NUL-terminated Py_UNICODE
   buffers.

   Returns -1 or +1 at the first differing element, and 0 when the
   strings agree over the first `n` elements or both end earlier. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE a = s1[i];
        const Py_UNICODE b = s2[i];

        if (a < b)
            return -1;
        if (a != b)
            return +1;
        if (a == '\0')
            return 0;           /* common terminator reached */
    }
    return 0;
}
15325
/* Return a pointer to the first occurrence of `c` in the NUL-terminated
   buffer `s`, or NULL if it does not occur.

   The terminator itself is never matched, so searching for 0 yields
   NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
15335
/* Return a pointer to the last occurrence of `c` in the NUL-terminated
   buffer `s`, or NULL if it does not occur.

   The terminator itself is never matched, so searching for 0 yields
   NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *last = NULL;

    /* Single forward pass that remembers the most recent match. */
    for (; *s != 0; s++) {
        if (*s == c)
            last = s;
    }
    return (Py_UNICODE*)last;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015348
Victor Stinner71133ff2010-09-01 23:43:53 +000015349Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015350PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015351{
Victor Stinner577db2c2011-10-11 22:12:48 +020015352 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015353 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015355 if (!PyUnicode_Check(unicode)) {
15356 PyErr_BadArgument();
15357 return NULL;
15358 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015359 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015360 if (u == NULL)
15361 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015362 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015363 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015364 PyErr_NoMemory();
15365 return NULL;
15366 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015367 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015368 size *= sizeof(Py_UNICODE);
15369 copy = PyMem_Malloc(size);
15370 if (copy == NULL) {
15371 PyErr_NoMemory();
15372 return NULL;
15373 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015374 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015375 return copy;
15376}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015377
Georg Brandl66c221e2010-10-14 07:04:07 +000015378/* A _string module, to export formatter_parser and formatter_field_name_split
15379 to the string.Formatter class implemented in Python. */
15380
15381static PyMethodDef _string_methods[] = {
15382 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15383 METH_O, PyDoc_STR("split the argument as a field name")},
15384 {"formatter_parser", (PyCFunction) formatter_parser,
15385 METH_O, PyDoc_STR("parse the argument as a format string")},
15386 {NULL, NULL}
15387};
15388
15389static struct PyModuleDef _string_module = {
15390 PyModuleDef_HEAD_INIT,
15391 "_string",
15392 PyDoc_STR("string helper module"),
15393 0,
15394 _string_methods,
15395 NULL,
15396 NULL,
15397 NULL,
15398 NULL
15399};
15400
15401PyMODINIT_FUNC
15402PyInit__string(void)
15403{
15404 return PyModule_Create(&_string_module);
15405}
15406
15407
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015408#ifdef __cplusplus
15409}
15410#endif