/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/

Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000043#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050044#include "bytes_methods.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000045
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000046#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000047#include <windows.h>
48#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000049
/*[clinic input]
class str "PyUnicodeObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/

/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/


#ifdef __cplusplus
extern "C" {
#endif

/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Type check used by the assertions in this file.  Debug builds run the
   structural consistency checker (with the content scan disabled); other
   builds only test the type. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Raw access to the utf8 pointer of a compact (non-ASCII) object. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked access to the UTF-8 buffer: for a compact ASCII string this is
   the memory that directly follows the PyASCIIObject header, otherwise
   the stored utf8 pointer. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op)))
/* Raw access to the utf8_length field of a compact (non-ASCII) object. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked access to the UTF-8 length: for a compact ASCII string this is
   the string length, otherwise the stored utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op) ?                  \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op)))
/* Raw field accessors.  They cast and dereference without any check, so
   they also work on objects that are still being built. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Same as above, but asserting _PyUnicode_CHECK(op) first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY: evaluates to 0 when the
   object is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))

/* true if the utf8 pointer aliases the canonical data buffer
   (only meaningful for objects that are not compact ASCII) */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* true if the wstr pointer aliases the canonical data buffer.
   The macro must only refer to its own parameter: it previously expanded
   to _PyUnicode_WSTR(unicode) and so silently depended on the caller
   having a variable named "unicode" in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data) */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))

/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The bulk of the input is copied four elements per iteration; the
   remaining 0-3 elements are handled by the trailing loop.  All locals
   carry a leading underscore so they do not shadow caller variables. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)

Walter Dörwald16807132007-05-25 13:52:07 +0000165/* This dictionary holds all interned unicode strings. Note that references
166 to strings in this dictionary are *not* counted in the string's ob_refcnt.
167 When the interned string reaches a refcnt of 0 the string deallocation
168 function will delete the reference from this dictionary.
169
170 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000171 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000172*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200173static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000174
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000175/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200176static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200177
Serhiy Storchaka678db842013-01-26 12:16:36 +0200178#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200179 do { \
180 if (unicode_empty != NULL) \
181 Py_INCREF(unicode_empty); \
182 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200183 unicode_empty = PyUnicode_New(0, 0); \
184 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200185 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200186 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
187 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200188 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200189 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000190
Serhiy Storchaka678db842013-01-26 12:16:36 +0200191#define _Py_RETURN_UNICODE_EMPTY() \
192 do { \
193 _Py_INCREF_UNICODE_EMPTY(); \
194 return unicode_empty; \
195 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000196
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200201/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200202static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200203
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000204/* Single character Unicode strings in the Latin-1 range are being
205 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200206static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000207
/* Fast detection of the most frequent whitespace characters.
   128-entry table indexed by an ASCII code: 1 for whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x0009 CHARACTER TABULATION
       0x000A LINE FEED
       0x000B LINE TABULATION
       0x000C FORM FEED
       0x000D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR
       0x001D GROUP SEPARATOR
       0x001E RECORD SEPARATOR
       0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27:
       0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200239/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200241static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100242static int unicode_modifiable(PyObject *unicode);
243
Victor Stinnerfe226c02011-10-03 03:52:20 +0200244
Alexander Belopolsky40018472011-02-26 01:02:56 +0000245static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000253unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000254 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100255 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000256 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
Alexander Belopolsky40018472011-02-26 01:02:56 +0000258static void
259raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300260 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100261 PyObject *unicode,
262 Py_ssize_t startpos, Py_ssize_t endpos,
263 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000264
/* Same for linebreaks: 128-entry table indexed by an ASCII code, 1 for a
   line break character, 0 otherwise.  The table is read-only. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x000A LINE FEED
       0x000B LINE TABULATION
       0x000C FORM FEED
       0x000D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x001C FILE SEPARATOR
       0x001D GROUP SEPARATOR
       0x001E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};

Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300293#include "clinic/unicodeobject.c.h"
294
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300295/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
296 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000297Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000298PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000299{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000300#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000301 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000302#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000303 /* This is actually an illegal character, so it should
304 not be passed to unichr. */
305 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000306#endif
307}
308
#ifdef Py_DEBUG
/* Debug-only invariant checker for a Unicode object.

   op:            object to inspect; must satisfy PyUnicode_Check().
   check_content: when non-zero (and the string is not of WCHAR kind),
                  also scan the characters to verify that the narrowest
                  suitable kind is in use and that the buffer ends with a
                  null character.

   Every violation trips an assert(); when all checks pass the function
   returns 1, so it can itself be used inside an assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header exists */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer may be set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* ready, non-compact string */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else {
                    assert (compact->utf8 != data);
                }
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr shares the data buffer exactly when the kind has the
               same width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else {
                assert(ascii->wstr != data);
            }
        }

        if (compact->utf8 == NULL) {
            assert(compact->utf8_length == 0);
        }
        if (ascii->wstr == NULL) {
            assert(compact->wstr_length == 0);
        }
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else {
                assert(maxchar < 128);
            }
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character buffer must be null-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif

Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100425static PyObject*
426unicode_result_wchar(PyObject *unicode)
427{
428#ifndef Py_DEBUG
429 Py_ssize_t len;
430
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100431 len = _PyUnicode_WSTR_LENGTH(unicode);
432 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100433 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200434 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100435 }
436
437 if (len == 1) {
438 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100439 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100440 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
441 Py_DECREF(unicode);
442 return latin1_char;
443 }
444 }
445
446 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200447 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100448 return NULL;
449 }
450#else
Victor Stinneraa771272012-10-04 02:32:58 +0200451 assert(Py_REFCNT(unicode) == 1);
452
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100453 /* don't make the result ready in debug mode to ensure that the caller
454 makes the string ready before using it */
455 assert(_PyUnicode_CheckConsistency(unicode, 1));
456#endif
457 return unicode;
458}
459
460static PyObject*
461unicode_result_ready(PyObject *unicode)
462{
463 Py_ssize_t length;
464
465 length = PyUnicode_GET_LENGTH(unicode);
466 if (length == 0) {
467 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100468 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200469 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100470 }
471 return unicode_empty;
472 }
473
474 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200475 void *data = PyUnicode_DATA(unicode);
476 int kind = PyUnicode_KIND(unicode);
477 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100478 if (ch < 256) {
479 PyObject *latin1_char = unicode_latin1[ch];
480 if (latin1_char != NULL) {
481 if (unicode != latin1_char) {
482 Py_INCREF(latin1_char);
483 Py_DECREF(unicode);
484 }
485 return latin1_char;
486 }
487 else {
488 assert(_PyUnicode_CheckConsistency(unicode, 1));
489 Py_INCREF(unicode);
490 unicode_latin1[ch] = unicode;
491 return unicode;
492 }
493 }
494 }
495
496 assert(_PyUnicode_CheckConsistency(unicode, 1));
497 return unicode;
498}
499
500static PyObject*
501unicode_result(PyObject *unicode)
502{
503 assert(_PyUnicode_CHECK(unicode));
504 if (PyUnicode_IS_READY(unicode))
505 return unicode_result_ready(unicode);
506 else
507 return unicode_result_wchar(unicode);
508}
509
Victor Stinnerc4b49542011-12-11 22:44:26 +0100510static PyObject*
511unicode_result_unchanged(PyObject *unicode)
512{
513 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500514 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100515 return NULL;
516 Py_INCREF(unicode);
517 return unicode;
518 }
519 else
520 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100521 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100522}
523
Thomas Wouters477c8d52006-05-27 19:21:47 +0000524/* --- Bloom Filters ----------------------------------------------------- */
525
526/* stuff to implement simple "bloom filters" for Unicode characters.
527 to keep things simple, we use a single bitmask, using the least 5
528 bits from each unicode characters as the bit index. */
529
530/* the linebreak mask is set up by Unicode_Init below */
531
Antoine Pitrouf068f942010-01-13 14:19:12 +0000532#if LONG_BIT >= 128
533#define BLOOM_WIDTH 128
534#elif LONG_BIT >= 64
535#define BLOOM_WIDTH 64
536#elif LONG_BIT >= 32
537#define BLOOM_WIDTH 32
538#else
539#error "LONG_BIT is smaller than 32"
540#endif
541
Thomas Wouters477c8d52006-05-27 19:21:47 +0000542#define BLOOM_MASK unsigned long
543
Serhiy Storchaka05997252013-01-26 12:14:02 +0200544static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000545
Antoine Pitrouf068f942010-01-13 14:19:12 +0000546#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000547
Benjamin Peterson29060642009-01-31 22:14:21 +0000548#define BLOOM_LINEBREAK(ch) \
549 ((ch) < 128U ? ascii_linebreak[(ch)] : \
550 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000551
Alexander Belopolsky40018472011-02-26 01:02:56 +0000552Py_LOCAL_INLINE(BLOOM_MASK)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200553make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000554{
Victor Stinnera85af502013-04-09 21:53:54 +0200555#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
556 do { \
557 TYPE *data = (TYPE *)PTR; \
558 TYPE *end = data + LEN; \
559 Py_UCS4 ch; \
560 for (; data != end; data++) { \
561 ch = *data; \
562 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
563 } \
564 break; \
565 } while (0)
566
Thomas Wouters477c8d52006-05-27 19:21:47 +0000567 /* calculate simple bloom-style bitmask for a given unicode string */
568
Antoine Pitrouf068f942010-01-13 14:19:12 +0000569 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000570
571 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200572 switch (kind) {
573 case PyUnicode_1BYTE_KIND:
574 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
575 break;
576 case PyUnicode_2BYTE_KIND:
577 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
578 break;
579 case PyUnicode_4BYTE_KIND:
580 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
581 break;
582 default:
583 assert(0);
584 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000585 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200586
587#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000588}
589
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200590/* Compilation of templated routines */
591
592#include "stringlib/asciilib.h"
593#include "stringlib/fastsearch.h"
594#include "stringlib/partition.h"
595#include "stringlib/split.h"
596#include "stringlib/count.h"
597#include "stringlib/find.h"
598#include "stringlib/find_max_char.h"
599#include "stringlib/localeutil.h"
600#include "stringlib/undef.h"
601
602#include "stringlib/ucs1lib.h"
603#include "stringlib/fastsearch.h"
604#include "stringlib/partition.h"
605#include "stringlib/split.h"
606#include "stringlib/count.h"
607#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300608#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200609#include "stringlib/find_max_char.h"
610#include "stringlib/localeutil.h"
611#include "stringlib/undef.h"
612
613#include "stringlib/ucs2lib.h"
614#include "stringlib/fastsearch.h"
615#include "stringlib/partition.h"
616#include "stringlib/split.h"
617#include "stringlib/count.h"
618#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300619#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200620#include "stringlib/find_max_char.h"
621#include "stringlib/localeutil.h"
622#include "stringlib/undef.h"
623
624#include "stringlib/ucs4lib.h"
625#include "stringlib/fastsearch.h"
626#include "stringlib/partition.h"
627#include "stringlib/split.h"
628#include "stringlib/count.h"
629#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300630#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200631#include "stringlib/find_max_char.h"
632#include "stringlib/localeutil.h"
633#include "stringlib/undef.h"
634
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200635#include "stringlib/unicodedefs.h"
636#include "stringlib/fastsearch.h"
637#include "stringlib/count.h"
638#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100639#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200640
Guido van Rossumd57fd912000-03-10 22:53:23 +0000641/* --- Unicode Object ----------------------------------------------------- */
642
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200643static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +0200644fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200645
Serhiy Storchakad9d769f2015-03-24 21:55:47 +0200646Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200647 Py_ssize_t size, Py_UCS4 ch,
648 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200649{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200650 int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
651
652 switch (kind) {
653 case PyUnicode_1BYTE_KIND:
654 {
655 Py_UCS1 ch1 = (Py_UCS1) ch;
656 if (ch1 == ch)
657 return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
658 else
659 return -1;
660 }
661 case PyUnicode_2BYTE_KIND:
662 {
663 Py_UCS2 ch2 = (Py_UCS2) ch;
664 if (ch2 == ch)
665 return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
666 else
667 return -1;
668 }
669 case PyUnicode_4BYTE_KIND:
670 return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
671 default:
672 assert(0);
673 return -1;
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200674 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200675}
676
Victor Stinnerafffce42012-10-03 23:03:17 +0200677#ifdef Py_DEBUG
678/* Fill the data of an Unicode string with invalid characters to detect bugs
679 earlier.
680
681 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
682 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
683 invalid character in Unicode 6.0. */
684static void
685unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
686{
687 int kind = PyUnicode_KIND(unicode);
688 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
689 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
690 if (length <= old_length)
691 return;
692 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
693}
694#endif
695
Victor Stinnerfe226c02011-10-03 03:52:20 +0200696static PyObject*
697resize_compact(PyObject *unicode, Py_ssize_t length)
698{
699 Py_ssize_t char_size;
700 Py_ssize_t struct_size;
701 Py_ssize_t new_size;
702 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100703 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200704#ifdef Py_DEBUG
705 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
706#endif
707
Victor Stinner79891572012-05-03 13:43:07 +0200708 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200709 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100710 assert(PyUnicode_IS_COMPACT(unicode));
711
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200712 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100713 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200714 struct_size = sizeof(PyASCIIObject);
715 else
716 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200717 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200718
Victor Stinnerfe226c02011-10-03 03:52:20 +0200719 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
720 PyErr_NoMemory();
721 return NULL;
722 }
723 new_size = (struct_size + (length + 1) * char_size);
724
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200725 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
726 PyObject_DEL(_PyUnicode_UTF8(unicode));
727 _PyUnicode_UTF8(unicode) = NULL;
728 _PyUnicode_UTF8_LENGTH(unicode) = 0;
729 }
Victor Stinner84def372011-12-11 20:04:56 +0100730 _Py_DEC_REFTOTAL;
731 _Py_ForgetReference(unicode);
732
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300733 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100734 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100735 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200736 PyErr_NoMemory();
737 return NULL;
738 }
Victor Stinner84def372011-12-11 20:04:56 +0100739 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200740 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +0100741
Victor Stinnerfe226c02011-10-03 03:52:20 +0200742 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200743 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200744 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100745 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +0200746 _PyUnicode_WSTR_LENGTH(unicode) = length;
747 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +0100748 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
749 PyObject_DEL(_PyUnicode_WSTR(unicode));
750 _PyUnicode_WSTR(unicode) = NULL;
751 }
Victor Stinnerafffce42012-10-03 23:03:17 +0200752#ifdef Py_DEBUG
753 unicode_fill_invalid(unicode, old_length);
754#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200755 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
756 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +0200757 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200758 return unicode;
759}
760
Alexander Belopolsky40018472011-02-26 01:02:56 +0000761static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200762resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000763{
Victor Stinner95663112011-10-04 01:03:50 +0200764 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100765 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200766 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200767 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +0000768
Victor Stinnerfe226c02011-10-03 03:52:20 +0200769 if (PyUnicode_IS_READY(unicode)) {
770 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200771 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200772 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +0200773#ifdef Py_DEBUG
774 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
775#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +0200776
777 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200778 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200779 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
780 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200781
782 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
783 PyErr_NoMemory();
784 return -1;
785 }
786 new_size = (length + 1) * char_size;
787
Victor Stinner7a9105a2011-12-12 00:13:42 +0100788 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
789 {
790 PyObject_DEL(_PyUnicode_UTF8(unicode));
791 _PyUnicode_UTF8(unicode) = NULL;
792 _PyUnicode_UTF8_LENGTH(unicode) = 0;
793 }
794
Victor Stinnerfe226c02011-10-03 03:52:20 +0200795 data = (PyObject *)PyObject_REALLOC(data, new_size);
796 if (data == NULL) {
797 PyErr_NoMemory();
798 return -1;
799 }
800 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200801 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200802 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200803 _PyUnicode_WSTR_LENGTH(unicode) = length;
804 }
805 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +0200806 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +0200807 _PyUnicode_UTF8_LENGTH(unicode) = length;
808 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200809 _PyUnicode_LENGTH(unicode) = length;
810 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +0200811#ifdef Py_DEBUG
812 unicode_fill_invalid(unicode, old_length);
813#endif
Victor Stinner95663112011-10-04 01:03:50 +0200814 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200815 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200816 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200817 }
Victor Stinnerfe226c02011-10-03 03:52:20 +0200818 }
Victor Stinner95663112011-10-04 01:03:50 +0200819 assert(_PyUnicode_WSTR(unicode) != NULL);
820
821 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700822 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +0200823 PyErr_NoMemory();
824 return -1;
825 }
Victor Stinner7a9105a2011-12-12 00:13:42 +0100826 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +0200827 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +0100828 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +0200829 if (!wstr) {
830 PyErr_NoMemory();
831 return -1;
832 }
833 _PyUnicode_WSTR(unicode) = wstr;
834 _PyUnicode_WSTR(unicode)[length] = 0;
835 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200836 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000837 return 0;
838}
839
Victor Stinnerfe226c02011-10-03 03:52:20 +0200840static PyObject*
841resize_copy(PyObject *unicode, Py_ssize_t length)
842{
843 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100844 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +0200845 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100846
Benjamin Petersonbac79492012-01-14 13:34:47 -0500847 if (PyUnicode_READY(unicode) == -1)
Victor Stinner7a9105a2011-12-12 00:13:42 +0100848 return NULL;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200849
850 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
851 if (copy == NULL)
852 return NULL;
853
854 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +0200855 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200856 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +0200857 }
858 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200859 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +0100860
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200861 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200862 if (w == NULL)
863 return NULL;
864 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
865 copy_length = Py_MIN(copy_length, length);
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +0200866 Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
867 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +0200868 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200869 }
870}
871
Guido van Rossumd57fd912000-03-10 22:53:23 +0000872/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +0000873 Ux0000 terminated; some code (e.g. new_identifier)
874 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000875
876 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +0000877 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878
879*/
880
Alexander Belopolsky40018472011-02-26 01:02:56 +0000881static PyUnicodeObject *
882_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000883{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +0200884 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200885 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000886
Thomas Wouters477c8d52006-05-27 19:21:47 +0000887 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000888 if (length == 0 && unicode_empty != NULL) {
889 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +0200890 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000891 }
892
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000893 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -0700894 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +0000895 return (PyUnicodeObject *)PyErr_NoMemory();
896 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200897 if (length < 0) {
898 PyErr_SetString(PyExc_SystemError,
899 "Negative size passed to _PyUnicode_New");
900 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000901 }
902
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200903 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
904 if (unicode == NULL)
905 return NULL;
906 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +0100907
908 _PyUnicode_WSTR_LENGTH(unicode) = length;
909 _PyUnicode_HASH(unicode) = -1;
910 _PyUnicode_STATE(unicode).interned = 0;
911 _PyUnicode_STATE(unicode).kind = 0;
912 _PyUnicode_STATE(unicode).compact = 0;
913 _PyUnicode_STATE(unicode).ready = 0;
914 _PyUnicode_STATE(unicode).ascii = 0;
915 _PyUnicode_DATA_ANY(unicode) = NULL;
916 _PyUnicode_LENGTH(unicode) = 0;
917 _PyUnicode_UTF8(unicode) = NULL;
918 _PyUnicode_UTF8_LENGTH(unicode) = 0;
919
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200920 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
921 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100922 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +0000923 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100924 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000925 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200926
Jeremy Hyltond8082792003-09-16 19:41:39 +0000927 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000928 * the caller fails before initializing str -- unicode_resize()
929 * reads str[0], and the Keep-Alive optimization can keep memory
930 * allocated for str alive across a call to unicode_dealloc(unicode).
931 * We don't want unicode_resize to read uninitialized memory in
932 * that case.
933 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934 _PyUnicode_WSTR(unicode)[0] = 0;
935 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +0100936
Victor Stinner7931d9a2011-11-04 00:22:48 +0100937 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000938 return unicode;
939}
940
Victor Stinnerf42dc442011-10-02 23:33:16 +0200941static const char*
942unicode_kind_name(PyObject *unicode)
943{
Victor Stinner42dfd712011-10-03 14:41:45 +0200944 /* don't check consistency: unicode_kind_name() is called from
945 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +0200946 if (!PyUnicode_IS_COMPACT(unicode))
947 {
948 if (!PyUnicode_IS_READY(unicode))
949 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -0600950 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200951 {
952 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200953 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200954 return "legacy ascii";
955 else
956 return "legacy latin1";
957 case PyUnicode_2BYTE_KIND:
958 return "legacy UCS2";
959 case PyUnicode_4BYTE_KIND:
960 return "legacy UCS4";
961 default:
962 return "<legacy invalid kind>";
963 }
964 }
965 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -0600966 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +0200967 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200968 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +0200969 return "ascii";
970 else
Victor Stinnera3b334d2011-10-03 13:53:37 +0200971 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200972 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200973 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200974 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +0200975 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +0200976 default:
977 return "<invalid compact kind>";
978 }
979}
980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200981#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200982/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
986
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *payload = _PyUnicode_COMPACT_DATA(unicode);
    return payload;
}
990void *_PyUnicode_data(void *unicode){
991 printf("obj %p\n", unicode);
992 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
993 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
994 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
995 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
996 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
997 return PyUnicode_DATA(unicode);
998}
Victor Stinnerfe0c1552011-10-03 02:59:31 +0200999
1000void
1001_PyUnicode_Dump(PyObject *op)
1002{
1003 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001004 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1005 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1006 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001007
Victor Stinnera849a4b2011-10-03 12:12:11 +02001008 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001009 {
1010 if (ascii->state.ascii)
1011 data = (ascii + 1);
1012 else
1013 data = (compact + 1);
1014 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001015 else
1016 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001017 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1018 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001019
Victor Stinnera849a4b2011-10-03 12:12:11 +02001020 if (ascii->wstr == data)
1021 printf("shared ");
1022 printf("wstr=%p", ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001023
Victor Stinnera3b334d2011-10-03 13:53:37 +02001024 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001025 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001026 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1027 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001028 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1029 compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001030 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001031 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001032}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001033#endif
1034
1035PyObject *
1036PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1037{
1038 PyObject *obj;
1039 PyCompactUnicodeObject *unicode;
1040 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001041 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001042 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001043 Py_ssize_t char_size;
1044 Py_ssize_t struct_size;
1045
1046 /* Optimization for empty strings */
1047 if (size == 0 && unicode_empty != NULL) {
1048 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001049 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001050 }
1051
Victor Stinner9e9d6892011-10-04 01:02:02 +02001052 is_ascii = 0;
1053 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001054 struct_size = sizeof(PyCompactUnicodeObject);
1055 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001056 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001057 char_size = 1;
1058 is_ascii = 1;
1059 struct_size = sizeof(PyASCIIObject);
1060 }
1061 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001062 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001063 char_size = 1;
1064 }
1065 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001066 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001067 char_size = 2;
1068 if (sizeof(wchar_t) == 2)
1069 is_sharing = 1;
1070 }
1071 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001072 if (maxchar > MAX_UNICODE) {
1073 PyErr_SetString(PyExc_SystemError,
1074 "invalid maximum character passed to PyUnicode_New");
1075 return NULL;
1076 }
Victor Stinner8f825062012-04-27 13:55:39 +02001077 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078 char_size = 4;
1079 if (sizeof(wchar_t) == 4)
1080 is_sharing = 1;
1081 }
1082
1083 /* Ensure we won't overflow the size. */
1084 if (size < 0) {
1085 PyErr_SetString(PyExc_SystemError,
1086 "Negative size passed to PyUnicode_New");
1087 return NULL;
1088 }
1089 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1090 return PyErr_NoMemory();
1091
1092 /* Duplicated allocation code from _PyObject_New() instead of a call to
1093 * PyObject_New() so we are able to allocate space for the object and
1094 * it's data buffer.
1095 */
1096 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1097 if (obj == NULL)
1098 return PyErr_NoMemory();
1099 obj = PyObject_INIT(obj, &PyUnicode_Type);
1100 if (obj == NULL)
1101 return NULL;
1102
1103 unicode = (PyCompactUnicodeObject *)obj;
1104 if (is_ascii)
1105 data = ((PyASCIIObject*)obj) + 1;
1106 else
1107 data = unicode + 1;
1108 _PyUnicode_LENGTH(unicode) = size;
1109 _PyUnicode_HASH(unicode) = -1;
1110 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001111 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001112 _PyUnicode_STATE(unicode).compact = 1;
1113 _PyUnicode_STATE(unicode).ready = 1;
1114 _PyUnicode_STATE(unicode).ascii = is_ascii;
1115 if (is_ascii) {
1116 ((char*)data)[size] = 0;
1117 _PyUnicode_WSTR(unicode) = NULL;
1118 }
Victor Stinner8f825062012-04-27 13:55:39 +02001119 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001120 ((char*)data)[size] = 0;
1121 _PyUnicode_WSTR(unicode) = NULL;
1122 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001123 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001124 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001125 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001126 else {
1127 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001128 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001129 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001130 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001131 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001132 ((Py_UCS4*)data)[size] = 0;
1133 if (is_sharing) {
1134 _PyUnicode_WSTR_LENGTH(unicode) = size;
1135 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1136 }
1137 else {
1138 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1139 _PyUnicode_WSTR(unicode) = NULL;
1140 }
1141 }
Victor Stinner8f825062012-04-27 13:55:39 +02001142#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001143 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001144#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001145 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001146 return obj;
1147}
1148
1149#if SIZEOF_WCHAR_T == 2
1150/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1151 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001152 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001153
1154 This function assumes that unicode can hold one more code point than wstr
1155 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001156static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001157unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001158 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159{
1160 const wchar_t *iter;
1161 Py_UCS4 *ucs4_out;
1162
Victor Stinner910337b2011-10-03 03:20:16 +02001163 assert(unicode != NULL);
1164 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1166 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1167
1168 for (iter = begin; iter < end; ) {
1169 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1170 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001171 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1172 && (iter+1) < end
1173 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001174 {
Victor Stinner551ac952011-11-29 22:58:13 +01001175 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001176 iter += 2;
1177 }
1178 else {
1179 *ucs4_out++ = *iter;
1180 iter++;
1181 }
1182 }
1183 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1184 _PyUnicode_GET_LENGTH(unicode)));
1185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001186}
1187#endif
1188
Victor Stinnercd9950f2011-10-02 00:34:53 +02001189static int
Victor Stinner488fa492011-12-12 00:01:39 +01001190unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001191{
Victor Stinner488fa492011-12-12 00:01:39 +01001192 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001193 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001194 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001195 return -1;
1196 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001197 return 0;
1198}
1199
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001200static int
1201_copy_characters(PyObject *to, Py_ssize_t to_start,
1202 PyObject *from, Py_ssize_t from_start,
1203 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001204{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001205 unsigned int from_kind, to_kind;
1206 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207
Victor Stinneree4544c2012-05-09 22:24:08 +02001208 assert(0 <= how_many);
1209 assert(0 <= from_start);
1210 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001211 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001212 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001213 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001214
Victor Stinnerd3f08822012-05-29 12:57:52 +02001215 assert(PyUnicode_Check(to));
1216 assert(PyUnicode_IS_READY(to));
1217 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1218
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001219 if (how_many == 0)
1220 return 0;
1221
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001223 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001224 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001225 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001226
Victor Stinnerf1852262012-06-16 16:38:26 +02001227#ifdef Py_DEBUG
1228 if (!check_maxchar
1229 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1230 {
1231 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1232 Py_UCS4 ch;
1233 Py_ssize_t i;
1234 for (i=0; i < how_many; i++) {
1235 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1236 assert(ch <= to_maxchar);
1237 }
1238 }
1239#endif
1240
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001241 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001242 if (check_maxchar
1243 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1244 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001245 /* Writing Latin-1 characters into an ASCII string requires to
1246 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001247 Py_UCS4 max_char;
1248 max_char = ucs1lib_find_max_char(from_data,
1249 (Py_UCS1*)from_data + how_many);
1250 if (max_char >= 128)
1251 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001252 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001253 Py_MEMCPY((char*)to_data + to_kind * to_start,
1254 (char*)from_data + from_kind * from_start,
1255 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001256 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001257 else if (from_kind == PyUnicode_1BYTE_KIND
1258 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001259 {
1260 _PyUnicode_CONVERT_BYTES(
1261 Py_UCS1, Py_UCS2,
1262 PyUnicode_1BYTE_DATA(from) + from_start,
1263 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1264 PyUnicode_2BYTE_DATA(to) + to_start
1265 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001266 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001267 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001268 && to_kind == PyUnicode_4BYTE_KIND)
1269 {
1270 _PyUnicode_CONVERT_BYTES(
1271 Py_UCS1, Py_UCS4,
1272 PyUnicode_1BYTE_DATA(from) + from_start,
1273 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1274 PyUnicode_4BYTE_DATA(to) + to_start
1275 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001276 }
1277 else if (from_kind == PyUnicode_2BYTE_KIND
1278 && to_kind == PyUnicode_4BYTE_KIND)
1279 {
1280 _PyUnicode_CONVERT_BYTES(
1281 Py_UCS2, Py_UCS4,
1282 PyUnicode_2BYTE_DATA(from) + from_start,
1283 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1284 PyUnicode_4BYTE_DATA(to) + to_start
1285 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001286 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001287 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001288 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1289
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001290 if (!check_maxchar) {
1291 if (from_kind == PyUnicode_2BYTE_KIND
1292 && to_kind == PyUnicode_1BYTE_KIND)
1293 {
1294 _PyUnicode_CONVERT_BYTES(
1295 Py_UCS2, Py_UCS1,
1296 PyUnicode_2BYTE_DATA(from) + from_start,
1297 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1298 PyUnicode_1BYTE_DATA(to) + to_start
1299 );
1300 }
1301 else if (from_kind == PyUnicode_4BYTE_KIND
1302 && to_kind == PyUnicode_1BYTE_KIND)
1303 {
1304 _PyUnicode_CONVERT_BYTES(
1305 Py_UCS4, Py_UCS1,
1306 PyUnicode_4BYTE_DATA(from) + from_start,
1307 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1308 PyUnicode_1BYTE_DATA(to) + to_start
1309 );
1310 }
1311 else if (from_kind == PyUnicode_4BYTE_KIND
1312 && to_kind == PyUnicode_2BYTE_KIND)
1313 {
1314 _PyUnicode_CONVERT_BYTES(
1315 Py_UCS4, Py_UCS2,
1316 PyUnicode_4BYTE_DATA(from) + from_start,
1317 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1318 PyUnicode_2BYTE_DATA(to) + to_start
1319 );
1320 }
1321 else {
1322 assert(0);
1323 return -1;
1324 }
1325 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001326 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001327 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001328 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001329 Py_ssize_t i;
1330
Victor Stinnera0702ab2011-09-29 14:14:38 +02001331 for (i=0; i < how_many; i++) {
1332 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001333 if (ch > to_maxchar)
1334 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001335 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1336 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001337 }
1338 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001339 return 0;
1340}
1341
Victor Stinnerd3f08822012-05-29 12:57:52 +02001342void
1343_PyUnicode_FastCopyCharacters(
1344 PyObject *to, Py_ssize_t to_start,
1345 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001346{
1347 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1348}
1349
1350Py_ssize_t
1351PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1352 PyObject *from, Py_ssize_t from_start,
1353 Py_ssize_t how_many)
1354{
1355 int err;
1356
1357 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1358 PyErr_BadInternalCall();
1359 return -1;
1360 }
1361
Benjamin Petersonbac79492012-01-14 13:34:47 -05001362 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001363 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001364 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001365 return -1;
1366
Victor Stinnerd3f08822012-05-29 12:57:52 +02001367 if (from_start < 0) {
1368 PyErr_SetString(PyExc_IndexError, "string index out of range");
1369 return -1;
1370 }
1371 if (to_start < 0) {
1372 PyErr_SetString(PyExc_IndexError, "string index out of range");
1373 return -1;
1374 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001375 how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1376 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1377 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001378 "Cannot write %zi characters at %zi "
1379 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001380 how_many, to_start, PyUnicode_GET_LENGTH(to));
1381 return -1;
1382 }
1383
1384 if (how_many == 0)
1385 return 0;
1386
Victor Stinner488fa492011-12-12 00:01:39 +01001387 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001388 return -1;
1389
1390 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1391 if (err) {
1392 PyErr_Format(PyExc_SystemError,
1393 "Cannot copy %s characters "
1394 "into a string of %s characters",
1395 unicode_kind_name(from),
1396 unicode_kind_name(to));
1397 return -1;
1398 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001399 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400}
1401
Victor Stinner17222162011-09-28 22:15:37 +02001402/* Find the maximum code point and count the number of surrogate pairs so a
1403 correct string length can be computed before converting a string to UCS4.
1404 This function counts single surrogates as a character and not as a pair.
1405
1406 Return 0 on success, or -1 on error. */
1407static int
1408find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1409 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410{
1411 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001412 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413
Victor Stinnerc53be962011-10-02 21:33:54 +02001414 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 *num_surrogates = 0;
1416 *maxchar = 0;
1417
1418 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001419#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001420 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1421 && (iter+1) < end
1422 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1423 {
1424 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1425 ++(*num_surrogates);
1426 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001427 }
1428 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001430 {
1431 ch = *iter;
1432 iter++;
1433 }
1434 if (ch > *maxchar) {
1435 *maxchar = ch;
1436 if (*maxchar > MAX_UNICODE) {
1437 PyErr_Format(PyExc_ValueError,
1438 "character U+%x is not in range [U+0000; U+10ffff]",
1439 ch);
1440 return -1;
1441 }
1442 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001443 }
1444 return 0;
1445}
1446
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001447int
1448_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449{
1450 wchar_t *end;
1451 Py_UCS4 maxchar = 0;
1452 Py_ssize_t num_surrogates;
1453#if SIZEOF_WCHAR_T == 2
1454 Py_ssize_t length_wo_surrogates;
1455#endif
1456
Georg Brandl7597add2011-10-05 16:36:47 +02001457 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001458 strings were created using _PyObject_New() and where no canonical
1459 representation (the str field) has been set yet aka strings
1460 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001461 assert(_PyUnicode_CHECK(unicode));
1462 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001463 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001464 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001465 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001466 /* Actually, it should neither be interned nor be anything else: */
1467 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001470 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001471 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473
1474 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001475 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1476 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477 PyErr_NoMemory();
1478 return -1;
1479 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001480 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001481 _PyUnicode_WSTR(unicode), end,
1482 PyUnicode_1BYTE_DATA(unicode));
1483 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1484 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1485 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1486 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001487 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001488 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001489 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 }
1491 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001492 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001493 _PyUnicode_UTF8(unicode) = NULL;
1494 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495 }
1496 PyObject_FREE(_PyUnicode_WSTR(unicode));
1497 _PyUnicode_WSTR(unicode) = NULL;
1498 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1499 }
1500 /* In this case we might have to convert down from 4-byte native
1501 wchar_t to 2-byte unicode. */
1502 else if (maxchar < 65536) {
1503 assert(num_surrogates == 0 &&
1504 "FindMaxCharAndNumSurrogatePairs() messed up");
1505
Victor Stinner506f5922011-09-28 22:34:18 +02001506#if SIZEOF_WCHAR_T == 2
1507 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001508 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001509 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1510 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1511 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001512 _PyUnicode_UTF8(unicode) = NULL;
1513 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001514#else
1515 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001516 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001517 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001518 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001519 PyErr_NoMemory();
1520 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521 }
Victor Stinner506f5922011-09-28 22:34:18 +02001522 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1523 _PyUnicode_WSTR(unicode), end,
1524 PyUnicode_2BYTE_DATA(unicode));
1525 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1526 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1527 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001528 _PyUnicode_UTF8(unicode) = NULL;
1529 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001530 PyObject_FREE(_PyUnicode_WSTR(unicode));
1531 _PyUnicode_WSTR(unicode) = NULL;
1532 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1533#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001534 }
1535 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1536 else {
1537#if SIZEOF_WCHAR_T == 2
1538 /* in case the native representation is 2-bytes, we need to allocate a
1539 new normalized 4-byte version. */
1540 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001541 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1542 PyErr_NoMemory();
1543 return -1;
1544 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001545 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1546 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 PyErr_NoMemory();
1548 return -1;
1549 }
1550 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1551 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001552 _PyUnicode_UTF8(unicode) = NULL;
1553 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001554 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1555 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001556 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001557 PyObject_FREE(_PyUnicode_WSTR(unicode));
1558 _PyUnicode_WSTR(unicode) = NULL;
1559 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1560#else
1561 assert(num_surrogates == 0);
1562
Victor Stinnerc3c74152011-10-02 20:39:55 +02001563 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001564 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001565 _PyUnicode_UTF8(unicode) = NULL;
1566 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001567 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1568#endif
1569 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1570 }
1571 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001572 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573 return 0;
1574}
1575
Alexander Belopolsky40018472011-02-26 01:02:56 +00001576static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001577unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001578{
Walter Dörwald16807132007-05-25 13:52:07 +00001579 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001580 case SSTATE_NOT_INTERNED:
1581 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001582
Benjamin Peterson29060642009-01-31 22:14:21 +00001583 case SSTATE_INTERNED_MORTAL:
1584 /* revive dead object temporarily for DelItem */
1585 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001586 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001587 Py_FatalError(
1588 "deletion of interned string failed");
1589 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001590
Benjamin Peterson29060642009-01-31 22:14:21 +00001591 case SSTATE_INTERNED_IMMORTAL:
1592 Py_FatalError("Immortal interned string died.");
Walter Dörwald16807132007-05-25 13:52:07 +00001593
Benjamin Peterson29060642009-01-31 22:14:21 +00001594 default:
1595 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001596 }
1597
Victor Stinner03490912011-10-03 23:45:12 +02001598 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001599 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001600 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001601 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001602 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1603 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001604
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001605 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606}
1607
#ifdef Py_DEBUG
/* Debug-build helper: return 1 if `unicode` is the shared empty string or
   the cached one-character string stored in unicode_latin1[] for its
   character, and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical kind can be a cached
       latin1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1624
Alexander Belopolsky40018472011-02-26 01:02:56 +00001625static int
Victor Stinner488fa492011-12-12 00:01:39 +01001626unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001627{
Victor Stinner488fa492011-12-12 00:01:39 +01001628 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001629 if (Py_REFCNT(unicode) != 1)
1630 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001631 if (_PyUnicode_HASH(unicode) != -1)
1632 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001633 if (PyUnicode_CHECK_INTERNED(unicode))
1634 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001635 if (!PyUnicode_CheckExact(unicode))
1636 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001637#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001638 /* singleton refcount is greater than 1 */
1639 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001640#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001641 return 1;
1642}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001643
Victor Stinnerfe226c02011-10-03 03:52:20 +02001644static int
1645unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1646{
1647 PyObject *unicode;
1648 Py_ssize_t old_length;
1649
1650 assert(p_unicode != NULL);
1651 unicode = *p_unicode;
1652
1653 assert(unicode != NULL);
1654 assert(PyUnicode_Check(unicode));
1655 assert(0 <= length);
1656
Victor Stinner910337b2011-10-03 03:20:16 +02001657 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001658 old_length = PyUnicode_WSTR_LENGTH(unicode);
1659 else
1660 old_length = PyUnicode_GET_LENGTH(unicode);
1661 if (old_length == length)
1662 return 0;
1663
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001664 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001665 _Py_INCREF_UNICODE_EMPTY();
1666 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001667 return -1;
Serhiy Storchaka5a57ade2015-12-24 10:35:59 +02001668 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001669 return 0;
1670 }
1671
Victor Stinner488fa492011-12-12 00:01:39 +01001672 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001673 PyObject *copy = resize_copy(unicode, length);
1674 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001675 return -1;
Serhiy Storchaka5a57ade2015-12-24 10:35:59 +02001676 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001677 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001678 }
1679
Victor Stinnerfe226c02011-10-03 03:52:20 +02001680 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001681 PyObject *new_unicode = resize_compact(unicode, length);
1682 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001683 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001684 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001685 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001686 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001687 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001688}
1689
Alexander Belopolsky40018472011-02-26 01:02:56 +00001690int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001691PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001692{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001693 PyObject *unicode;
1694 if (p_unicode == NULL) {
1695 PyErr_BadInternalCall();
1696 return -1;
1697 }
1698 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001699 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001700 {
1701 PyErr_BadInternalCall();
1702 return -1;
1703 }
1704 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001705}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001706
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001707/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001708
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001709 WARNING: The function doesn't copy the terminating null character and
1710 doesn't check the maximum character (may write a latin1 character in an
1711 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001712static void
1713unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1714 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001715{
1716 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1717 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001718 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001719
1720 switch (kind) {
1721 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001722 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001723#ifdef Py_DEBUG
1724 if (PyUnicode_IS_ASCII(unicode)) {
1725 Py_UCS4 maxchar = ucs1lib_find_max_char(
1726 (const Py_UCS1*)str,
1727 (const Py_UCS1*)str + len);
1728 assert(maxchar < 128);
1729 }
1730#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001731 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001732 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001733 }
1734 case PyUnicode_2BYTE_KIND: {
1735 Py_UCS2 *start = (Py_UCS2 *)data + index;
1736 Py_UCS2 *ucs2 = start;
1737 assert(index <= PyUnicode_GET_LENGTH(unicode));
1738
Victor Stinner184252a2012-06-16 02:57:41 +02001739 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001740 *ucs2 = (Py_UCS2)*str;
1741
1742 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02001743 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01001744 }
1745 default: {
1746 Py_UCS4 *start = (Py_UCS4 *)data + index;
1747 Py_UCS4 *ucs4 = start;
1748 assert(kind == PyUnicode_4BYTE_KIND);
1749 assert(index <= PyUnicode_GET_LENGTH(unicode));
1750
Victor Stinner184252a2012-06-16 02:57:41 +02001751 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01001752 *ucs4 = (Py_UCS4)*str;
1753
1754 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01001755 }
1756 }
1757}
1758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759static PyObject*
1760get_latin1_char(unsigned char ch)
1761{
Victor Stinnera464fc12011-10-02 20:39:30 +02001762 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02001764 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 if (!unicode)
1766 return NULL;
1767 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001768 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 unicode_latin1[ch] = unicode;
1770 }
1771 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02001772 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773}
1774
Victor Stinner985a82a2014-01-03 12:53:47 +01001775static PyObject*
1776unicode_char(Py_UCS4 ch)
1777{
1778 PyObject *unicode;
1779
1780 assert(ch <= MAX_UNICODE);
1781
Victor Stinnerf3b46b42014-01-03 13:16:00 +01001782 if (ch < 256)
1783 return get_latin1_char(ch);
1784
Victor Stinner985a82a2014-01-03 12:53:47 +01001785 unicode = PyUnicode_New(1, ch);
1786 if (unicode == NULL)
1787 return NULL;
1788 switch (PyUnicode_KIND(unicode)) {
1789 case PyUnicode_1BYTE_KIND:
1790 PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1791 break;
1792 case PyUnicode_2BYTE_KIND:
1793 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1794 break;
1795 default:
1796 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1797 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1798 }
1799 assert(_PyUnicode_CheckConsistency(unicode, 1));
1800 return unicode;
1801}
1802
Alexander Belopolsky40018472011-02-26 01:02:56 +00001803PyObject *
1804PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001806 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 Py_UCS4 maxchar = 0;
1808 Py_ssize_t num_surrogates;
1809
1810 if (u == NULL)
1811 return (PyObject*)_PyUnicode_New(size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001813 /* If the Unicode data is known at construction time, we can apply
1814 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02001817 if (size == 0)
1818 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00001819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001820 /* Single character Unicode objects in the Latin-1 range are
1821 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001822 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 return get_latin1_char((unsigned char)*u);
1824
1825 /* If not empty and not single character, copy the Unicode data
1826 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02001827 if (find_maxchar_surrogates(u, u + size,
1828 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 return NULL;
1830
Victor Stinner8faf8212011-12-08 22:14:11 +01001831 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001832 if (!unicode)
1833 return NULL;
1834
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 switch (PyUnicode_KIND(unicode)) {
1836 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001837 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838 u, u + size, PyUnicode_1BYTE_DATA(unicode));
1839 break;
1840 case PyUnicode_2BYTE_KIND:
1841#if Py_UNICODE_SIZE == 2
1842 Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1843#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001844 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 u, u + size, PyUnicode_2BYTE_DATA(unicode));
1846#endif
1847 break;
1848 case PyUnicode_4BYTE_KIND:
1849#if SIZEOF_WCHAR_T == 2
1850 /* This is the only case which has to process surrogates, thus
1851 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001852 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001853#else
1854 assert(num_surrogates == 0);
1855 Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1856#endif
1857 break;
1858 default:
1859 assert(0 && "Impossible state");
1860 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001861
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001862 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863}
1864
Alexander Belopolsky40018472011-02-26 01:02:56 +00001865PyObject *
1866PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001867{
Benjamin Peterson14339b62009-01-31 16:36:08 +00001868 if (size < 0) {
1869 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00001870 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00001871 return NULL;
1872 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001873 if (u != NULL)
1874 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1875 else
1876 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00001877}
1878
Alexander Belopolsky40018472011-02-26 01:02:56 +00001879PyObject *
1880PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00001881{
1882 size_t size = strlen(u);
1883 if (size > PY_SSIZE_T_MAX) {
1884 PyErr_SetString(PyExc_OverflowError, "input too long");
1885 return NULL;
1886 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01001887 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00001888}
1889
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001890PyObject *
1891_PyUnicode_FromId(_Py_Identifier *id)
1892{
1893 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01001894 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1895 strlen(id->string),
1896 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001897 if (!id->object)
1898 return NULL;
1899 PyUnicode_InternInPlace(&id->object);
1900 assert(!id->next);
1901 id->next = static_strings;
1902 static_strings = id;
1903 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001904 return id->object;
1905}
1906
1907void
1908_PyUnicode_ClearStaticStrings()
1909{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001910 _Py_Identifier *tmp, *s = static_strings;
1911 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02001912 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001913 tmp = s->next;
1914 s->next = NULL;
1915 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001916 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06001917 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02001918}
1919
Benjamin Peterson0df54292012-03-26 14:50:32 -04001920/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001921
Victor Stinnerd3f08822012-05-29 12:57:52 +02001922PyObject*
1923_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02001924{
Victor Stinnerd3f08822012-05-29 12:57:52 +02001925 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01001926 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01001927 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02001928#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001929 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02001930#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001931 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01001932 }
Victor Stinner785938e2011-12-11 20:09:03 +01001933 unicode = PyUnicode_New(size, 127);
1934 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02001935 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01001936 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1937 assert(_PyUnicode_CheckConsistency(unicode, 1));
1938 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02001939}
1940
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001941static Py_UCS4
1942kind_maxchar_limit(unsigned int kind)
1943{
Benjamin Petersonead6b532011-12-20 17:23:42 -06001944 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001945 case PyUnicode_1BYTE_KIND:
1946 return 0x80;
1947 case PyUnicode_2BYTE_KIND:
1948 return 0x100;
1949 case PyUnicode_4BYTE_KIND:
1950 return 0x10000;
1951 default:
1952 assert(0 && "invalid kind");
Victor Stinner8faf8212011-12-08 22:14:11 +01001953 return MAX_UNICODE;
Victor Stinnerc80d6d22011-10-05 14:13:28 +02001954 }
1955}
1956
Victor Stinnere6abb482012-05-02 01:15:40 +02001957Py_LOCAL_INLINE(Py_UCS4)
1958align_maxchar(Py_UCS4 maxchar)
1959{
1960 if (maxchar <= 127)
1961 return 127;
1962 else if (maxchar <= 255)
1963 return 255;
1964 else if (maxchar <= 65535)
1965 return 65535;
1966 else
1967 return MAX_UNICODE;
1968}
1969
Victor Stinner702c7342011-10-05 13:50:52 +02001970static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01001971_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00001972{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001974 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001975
Serhiy Storchaka678db842013-01-26 12:16:36 +02001976 if (size == 0)
1977 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001978 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02001979 if (size == 1)
1980 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001981
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02001982 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02001983 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001984 if (!res)
1985 return NULL;
1986 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001987 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001988 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00001989}
1990
Victor Stinnere57b1c02011-09-28 22:20:48 +02001991static PyObject*
1992_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001993{
1994 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001995 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02001996
Serhiy Storchaka678db842013-01-26 12:16:36 +02001997 if (size == 0)
1998 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001999 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002000 if (size == 1)
2001 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002002
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002003 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002004 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002005 if (!res)
2006 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002007 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002008 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002009 else {
2010 _PyUnicode_CONVERT_BYTES(
2011 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2012 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002013 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002014 return res;
2015}
2016
Victor Stinnere57b1c02011-09-28 22:20:48 +02002017static PyObject*
2018_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002019{
2020 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002021 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002022
Serhiy Storchaka678db842013-01-26 12:16:36 +02002023 if (size == 0)
2024 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002025 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002026 if (size == 1)
2027 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002028
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002029 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002030 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002031 if (!res)
2032 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002033 if (max_char < 256)
2034 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2035 PyUnicode_1BYTE_DATA(res));
2036 else if (max_char < 0x10000)
2037 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2038 PyUnicode_2BYTE_DATA(res));
2039 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002041 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042 return res;
2043}
2044
2045PyObject*
2046PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2047{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002048 if (size < 0) {
2049 PyErr_SetString(PyExc_ValueError, "size must be positive");
2050 return NULL;
2051 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002052 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002053 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002054 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002055 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002056 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002057 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002058 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002059 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002060 PyErr_SetString(PyExc_SystemError, "invalid kind");
2061 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002062 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002063}
2064
Victor Stinnerece58de2012-04-23 23:36:38 +02002065Py_UCS4
2066_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2067{
2068 enum PyUnicode_Kind kind;
2069 void *startptr, *endptr;
2070
2071 assert(PyUnicode_IS_READY(unicode));
2072 assert(0 <= start);
2073 assert(end <= PyUnicode_GET_LENGTH(unicode));
2074 assert(start <= end);
2075
2076 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2077 return PyUnicode_MAX_CHAR_VALUE(unicode);
2078
2079 if (start == end)
2080 return 127;
2081
Victor Stinner94d558b2012-04-27 22:26:58 +02002082 if (PyUnicode_IS_ASCII(unicode))
2083 return 127;
2084
Victor Stinnerece58de2012-04-23 23:36:38 +02002085 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002086 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002087 endptr = (char *)startptr + end * kind;
2088 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002089 switch(kind) {
2090 case PyUnicode_1BYTE_KIND:
2091 return ucs1lib_find_max_char(startptr, endptr);
2092 case PyUnicode_2BYTE_KIND:
2093 return ucs2lib_find_max_char(startptr, endptr);
2094 case PyUnicode_4BYTE_KIND:
2095 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002096 default:
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002097 assert(0);
2098 return 0;
Victor Stinnerece58de2012-04-23 23:36:38 +02002099 }
2100}
2101
Victor Stinner25a4b292011-10-06 12:31:55 +02002102/* Ensure that a string uses the most efficient storage, if it is not the
2103 case: create a new string with of the right kind. Write NULL into *p_unicode
2104 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002105static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002106unicode_adjust_maxchar(PyObject **p_unicode)
2107{
2108 PyObject *unicode, *copy;
2109 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002110 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002111 unsigned int kind;
2112
2113 assert(p_unicode != NULL);
2114 unicode = *p_unicode;
2115 assert(PyUnicode_IS_READY(unicode));
2116 if (PyUnicode_IS_ASCII(unicode))
2117 return;
2118
2119 len = PyUnicode_GET_LENGTH(unicode);
2120 kind = PyUnicode_KIND(unicode);
2121 if (kind == PyUnicode_1BYTE_KIND) {
2122 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002123 max_char = ucs1lib_find_max_char(u, u + len);
2124 if (max_char >= 128)
2125 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002126 }
2127 else if (kind == PyUnicode_2BYTE_KIND) {
2128 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002129 max_char = ucs2lib_find_max_char(u, u + len);
2130 if (max_char >= 256)
2131 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002132 }
2133 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002134 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002135 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002136 max_char = ucs4lib_find_max_char(u, u + len);
2137 if (max_char >= 0x10000)
2138 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002139 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002140 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002141 if (copy != NULL)
2142 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002143 Py_DECREF(unicode);
2144 *p_unicode = copy;
2145}
2146
Victor Stinner034f6cf2011-09-30 02:26:44 +02002147PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002148_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002149{
Victor Stinner87af4f22011-11-21 23:03:47 +01002150 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002151 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002152
Victor Stinner034f6cf2011-09-30 02:26:44 +02002153 if (!PyUnicode_Check(unicode)) {
2154 PyErr_BadInternalCall();
2155 return NULL;
2156 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002157 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002158 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002159
Victor Stinner87af4f22011-11-21 23:03:47 +01002160 length = PyUnicode_GET_LENGTH(unicode);
2161 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002162 if (!copy)
2163 return NULL;
2164 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2165
Victor Stinner87af4f22011-11-21 23:03:47 +01002166 Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2167 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002168 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002169 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002170}
2171
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172
Victor Stinnerbc603d12011-10-02 01:00:40 +02002173/* Widen Unicode objects to larger buffers. Don't write terminating null
2174 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002175
2176void*
2177_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2178{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002179 Py_ssize_t len;
2180 void *result;
2181 unsigned int skind;
2182
Benjamin Petersonbac79492012-01-14 13:34:47 -05002183 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002184 return NULL;
2185
2186 len = PyUnicode_GET_LENGTH(s);
2187 skind = PyUnicode_KIND(s);
2188 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002189 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002190 return NULL;
2191 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002192 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002193 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002194 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002195 if (!result)
2196 return PyErr_NoMemory();
2197 assert(skind == PyUnicode_1BYTE_KIND);
2198 _PyUnicode_CONVERT_BYTES(
2199 Py_UCS1, Py_UCS2,
2200 PyUnicode_1BYTE_DATA(s),
2201 PyUnicode_1BYTE_DATA(s) + len,
2202 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002204 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002205 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002206 if (!result)
2207 return PyErr_NoMemory();
2208 if (skind == PyUnicode_2BYTE_KIND) {
2209 _PyUnicode_CONVERT_BYTES(
2210 Py_UCS2, Py_UCS4,
2211 PyUnicode_2BYTE_DATA(s),
2212 PyUnicode_2BYTE_DATA(s) + len,
2213 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002214 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002215 else {
2216 assert(skind == PyUnicode_1BYTE_KIND);
2217 _PyUnicode_CONVERT_BYTES(
2218 Py_UCS1, Py_UCS4,
2219 PyUnicode_1BYTE_DATA(s),
2220 PyUnicode_1BYTE_DATA(s) + len,
2221 result);
2222 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002224 default:
2225 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002226 }
Victor Stinner01698042011-10-04 00:04:26 +02002227 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 return NULL;
2229}
2230
2231static Py_UCS4*
2232as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2233 int copy_null)
2234{
2235 int kind;
2236 void *data;
2237 Py_ssize_t len, targetlen;
2238 if (PyUnicode_READY(string) == -1)
2239 return NULL;
2240 kind = PyUnicode_KIND(string);
2241 data = PyUnicode_DATA(string);
2242 len = PyUnicode_GET_LENGTH(string);
2243 targetlen = len;
2244 if (copy_null)
2245 targetlen++;
2246 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002247 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002248 if (!target) {
2249 PyErr_NoMemory();
2250 return NULL;
2251 }
2252 }
2253 else {
2254 if (targetsize < targetlen) {
2255 PyErr_Format(PyExc_SystemError,
2256 "string is longer than the buffer");
2257 if (copy_null && 0 < targetsize)
2258 target[0] = 0;
2259 return NULL;
2260 }
2261 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002262 if (kind == PyUnicode_1BYTE_KIND) {
2263 Py_UCS1 *start = (Py_UCS1 *) data;
2264 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002266 else if (kind == PyUnicode_2BYTE_KIND) {
2267 Py_UCS2 *start = (Py_UCS2 *) data;
2268 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2269 }
2270 else {
2271 assert(kind == PyUnicode_4BYTE_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002272 Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002273 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274 if (copy_null)
2275 target[len] = 0;
2276 return target;
2277}
2278
2279Py_UCS4*
2280PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2281 int copy_null)
2282{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002283 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 PyErr_BadInternalCall();
2285 return NULL;
2286 }
2287 return as_ucs4(string, target, targetsize, copy_null);
2288}
2289
2290Py_UCS4*
2291PyUnicode_AsUCS4Copy(PyObject *string)
2292{
2293 return as_ucs4(string, NULL, 0, 1);
2294}
2295
#ifdef HAVE_WCHAR_H

/* Create a string object from the wchar_t buffer `w` of `size` characters.
   A `size` of -1 means that `w` is null-terminated and its length is
   computed with wcslen().  A NULL `w` is only accepted together with a
   size of 0 (the empty string is returned); any other size is reported as
   a bad internal call. */
PyObject *
PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
{
    if (w == NULL) {
        if (size != 0) {
            PyErr_BadInternalCall();
            return NULL;
        }
        _Py_RETURN_UNICODE_EMPTY();
    }

    if (size == -1)
        size = wcslen(w);

    return PyUnicode_FromUnicode(w, size);
}

#endif /* HAVE_WCHAR_H */
Mark Dickinson081dfee2009-03-18 14:47:41 +00002316
/* Size of the scratch buffers used to format the output of %lld or %p.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits, plus 1 character for the sign; 53/22 is used as an upper bound
   for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG * 53) - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002321
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002322static int
2323unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2324 Py_ssize_t width, Py_ssize_t precision)
2325{
2326 Py_ssize_t length, fill, arglen;
2327 Py_UCS4 maxchar;
2328
2329 if (PyUnicode_READY(str) == -1)
2330 return -1;
2331
2332 length = PyUnicode_GET_LENGTH(str);
2333 if ((precision == -1 || precision >= length)
2334 && width <= length)
2335 return _PyUnicodeWriter_WriteStr(writer, str);
2336
2337 if (precision != -1)
2338 length = Py_MIN(precision, length);
2339
2340 arglen = Py_MAX(length, width);
2341 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2342 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2343 else
2344 maxchar = writer->maxchar;
2345
2346 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2347 return -1;
2348
2349 if (width > length) {
2350 fill = width - length;
2351 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2352 return -1;
2353 writer->pos += fill;
2354 }
2355
2356 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2357 str, 0, length);
2358 writer->pos += length;
2359 return 0;
2360}
2361
2362static int
2363unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2364 Py_ssize_t width, Py_ssize_t precision)
2365{
2366 /* UTF-8 */
2367 Py_ssize_t length;
2368 PyObject *unicode;
2369 int res;
2370
2371 length = strlen(str);
2372 if (precision != -1)
2373 length = Py_MIN(length, precision);
2374 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2375 if (unicode == NULL)
2376 return -1;
2377
2378 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2379 Py_DECREF(unicode);
2380 return res;
2381}
2382
Victor Stinner96865452011-03-01 23:44:09 +00002383static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002384unicode_fromformat_arg(_PyUnicodeWriter *writer,
2385 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002386{
Victor Stinnere215d962012-10-06 23:03:36 +02002387 const char *p;
2388 Py_ssize_t len;
2389 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002390 Py_ssize_t width;
2391 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002392 int longflag;
2393 int longlongflag;
2394 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002395 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002396
2397 p = f;
2398 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002399 zeropad = 0;
2400 if (*f == '0') {
2401 zeropad = 1;
2402 f++;
2403 }
Victor Stinner96865452011-03-01 23:44:09 +00002404
2405 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002406 width = -1;
2407 if (Py_ISDIGIT((unsigned)*f)) {
2408 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002409 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002410 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002411 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002412 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002413 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002414 return NULL;
2415 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002416 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002417 f++;
2418 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002419 }
2420 precision = -1;
2421 if (*f == '.') {
2422 f++;
2423 if (Py_ISDIGIT((unsigned)*f)) {
2424 precision = (*f - '0');
2425 f++;
2426 while (Py_ISDIGIT((unsigned)*f)) {
2427 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2428 PyErr_SetString(PyExc_ValueError,
2429 "precision too big");
2430 return NULL;
2431 }
2432 precision = (precision * 10) + (*f - '0');
2433 f++;
2434 }
2435 }
Victor Stinner96865452011-03-01 23:44:09 +00002436 if (*f == '%') {
2437 /* "%.3%s" => f points to "3" */
2438 f--;
2439 }
2440 }
2441 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002442 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002443 f--;
2444 }
Victor Stinner96865452011-03-01 23:44:09 +00002445
2446 /* Handle %ld, %lu, %lld and %llu. */
2447 longflag = 0;
2448 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002449 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002450 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002451 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002452 longflag = 1;
2453 ++f;
2454 }
2455#ifdef HAVE_LONG_LONG
2456 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002457 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002458 longlongflag = 1;
2459 f += 2;
2460 }
2461#endif
2462 }
2463 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002464 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002465 size_tflag = 1;
2466 ++f;
2467 }
Victor Stinnere215d962012-10-06 23:03:36 +02002468
2469 if (f[1] == '\0')
2470 writer->overallocate = 0;
2471
2472 switch (*f) {
2473 case 'c':
2474 {
2475 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002476 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002477 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002478 "character argument not in range(0x110000)");
2479 return NULL;
2480 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002481 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002482 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002483 break;
2484 }
2485
2486 case 'i':
2487 case 'd':
2488 case 'u':
2489 case 'x':
2490 {
2491 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002492 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002493 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002494
2495 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002496 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002497 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002498 va_arg(*vargs, unsigned long));
2499#ifdef HAVE_LONG_LONG
2500 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002501 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002502 va_arg(*vargs, unsigned PY_LONG_LONG));
2503#endif
2504 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002505 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002506 va_arg(*vargs, size_t));
2507 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002508 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002509 va_arg(*vargs, unsigned int));
2510 }
2511 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002512 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002513 }
2514 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002515 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002516 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002517 va_arg(*vargs, long));
2518#ifdef HAVE_LONG_LONG
2519 else if (longlongflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002520 len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002521 va_arg(*vargs, PY_LONG_LONG));
2522#endif
2523 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002524 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002525 va_arg(*vargs, Py_ssize_t));
2526 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002527 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002528 va_arg(*vargs, int));
2529 }
2530 assert(len >= 0);
2531
Victor Stinnere215d962012-10-06 23:03:36 +02002532 if (precision < len)
2533 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002534
2535 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002536 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2537 return NULL;
2538
Victor Stinnere215d962012-10-06 23:03:36 +02002539 if (width > precision) {
2540 Py_UCS4 fillchar;
2541 fill = width - precision;
2542 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002543 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2544 return NULL;
2545 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002546 }
Victor Stinner15a11362012-10-06 23:48:20 +02002547 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002548 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002549 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2550 return NULL;
2551 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002552 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002553
Victor Stinner4a587072013-11-19 12:54:53 +01002554 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2555 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002556 break;
2557 }
2558
2559 case 'p':
2560 {
2561 char number[MAX_LONG_LONG_CHARS];
2562
2563 len = sprintf(number, "%p", va_arg(*vargs, void*));
2564 assert(len >= 0);
2565
2566 /* %p is ill-defined: ensure leading 0x. */
2567 if (number[1] == 'X')
2568 number[1] = 'x';
2569 else if (number[1] != 'x') {
2570 memmove(number + 2, number,
2571 strlen(number) + 1);
2572 number[0] = '0';
2573 number[1] = 'x';
2574 len += 2;
2575 }
2576
Victor Stinner4a587072013-11-19 12:54:53 +01002577 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002578 return NULL;
2579 break;
2580 }
2581
2582 case 's':
2583 {
2584 /* UTF-8 */
2585 const char *s = va_arg(*vargs, const char*);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002586 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002587 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002588 break;
2589 }
2590
2591 case 'U':
2592 {
2593 PyObject *obj = va_arg(*vargs, PyObject *);
2594 assert(obj && _PyUnicode_CHECK(obj));
2595
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002596 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002597 return NULL;
2598 break;
2599 }
2600
2601 case 'V':
2602 {
2603 PyObject *obj = va_arg(*vargs, PyObject *);
2604 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002605 if (obj) {
2606 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002607 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002608 return NULL;
2609 }
2610 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002611 assert(str != NULL);
2612 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002613 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002614 }
2615 break;
2616 }
2617
2618 case 'S':
2619 {
2620 PyObject *obj = va_arg(*vargs, PyObject *);
2621 PyObject *str;
2622 assert(obj);
2623 str = PyObject_Str(obj);
2624 if (!str)
2625 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002626 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002627 Py_DECREF(str);
2628 return NULL;
2629 }
2630 Py_DECREF(str);
2631 break;
2632 }
2633
2634 case 'R':
2635 {
2636 PyObject *obj = va_arg(*vargs, PyObject *);
2637 PyObject *repr;
2638 assert(obj);
2639 repr = PyObject_Repr(obj);
2640 if (!repr)
2641 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002642 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002643 Py_DECREF(repr);
2644 return NULL;
2645 }
2646 Py_DECREF(repr);
2647 break;
2648 }
2649
2650 case 'A':
2651 {
2652 PyObject *obj = va_arg(*vargs, PyObject *);
2653 PyObject *ascii;
2654 assert(obj);
2655 ascii = PyObject_ASCII(obj);
2656 if (!ascii)
2657 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002658 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002659 Py_DECREF(ascii);
2660 return NULL;
2661 }
2662 Py_DECREF(ascii);
2663 break;
2664 }
2665
2666 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002667 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002668 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002669 break;
2670
2671 default:
2672 /* if we stumble upon an unknown formatting code, copy the rest
2673 of the format string to the output string. (we cannot just
2674 skip the code, since there's no way to know what's in the
2675 argument list) */
2676 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002677 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002678 return NULL;
2679 f = p+len;
2680 return f;
2681 }
2682
2683 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002684 return f;
2685}
2686
Walter Dörwaldd2034312007-05-18 16:29:38 +00002687PyObject *
2688PyUnicode_FromFormatV(const char *format, va_list vargs)
2689{
Victor Stinnere215d962012-10-06 23:03:36 +02002690 va_list vargs2;
2691 const char *f;
2692 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002693
Victor Stinner8f674cc2013-04-17 23:02:17 +02002694 _PyUnicodeWriter_Init(&writer);
2695 writer.min_length = strlen(format) + 100;
2696 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002697
2698 /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2699 Copy it to be able to pass a reference to a subfunction. */
2700 Py_VA_COPY(vargs2, vargs);
2701
2702 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002703 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002704 f = unicode_fromformat_arg(&writer, f, &vargs2);
2705 if (f == NULL)
2706 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002707 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002708 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002709 const char *p;
2710 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002711
Victor Stinnere215d962012-10-06 23:03:36 +02002712 p = f;
2713 do
2714 {
2715 if ((unsigned char)*p > 127) {
2716 PyErr_Format(PyExc_ValueError,
2717 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2718 "string, got a non-ASCII byte: 0x%02x",
2719 (unsigned char)*p);
2720 return NULL;
2721 }
2722 p++;
2723 }
2724 while (*p != '\0' && *p != '%');
2725 len = p - f;
2726
2727 if (*p == '\0')
2728 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002729
2730 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002731 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002732
2733 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002734 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002735 }
Victor Stinnere215d962012-10-06 23:03:36 +02002736 return _PyUnicodeWriter_Finish(&writer);
2737
2738 fail:
2739 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002740 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002741}
2742
Walter Dörwaldd2034312007-05-18 16:29:38 +00002743PyObject *
2744PyUnicode_FromFormat(const char *format, ...)
2745{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002746 PyObject* ret;
2747 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002748
2749#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002750 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002751#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00002752 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002753#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00002754 ret = PyUnicode_FromFormatV(format, vargs);
2755 va_end(vargs);
2756 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002757}
2758
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002759#ifdef HAVE_WCHAR_H
2760
Victor Stinner5593d8a2010-10-02 11:11:27 +00002761/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2762 convert a Unicode object to a wide character string.
2763
Victor Stinnerd88d9832011-09-06 02:00:05 +02002764 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002765 character) required to convert the unicode object. Ignore size argument.
2766
Victor Stinnerd88d9832011-09-06 02:00:05 +02002767 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00002768 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02002769 the null character). */
Victor Stinner5593d8a2010-10-02 11:11:27 +00002770static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002771unicode_aswidechar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002772 wchar_t *w,
2773 Py_ssize_t size)
2774{
Victor Stinner5593d8a2010-10-02 11:11:27 +00002775 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002776 const wchar_t *wstr;
2777
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002778 wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002779 if (wstr == NULL)
2780 return -1;
2781
Victor Stinner5593d8a2010-10-02 11:11:27 +00002782 if (w != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00002783 if (size > res)
2784 size = res + 1;
2785 else
2786 res = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002787 Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
Victor Stinner5593d8a2010-10-02 11:11:27 +00002788 return res;
2789 }
2790 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002791 return res + 1;
Victor Stinner137c34c2010-09-29 10:25:54 +00002792}
2793
2794Py_ssize_t
Martin v. Löwis4d0d4712010-12-03 20:14:31 +00002795PyUnicode_AsWideChar(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002796 wchar_t *w,
2797 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798{
2799 if (unicode == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002800 PyErr_BadInternalCall();
2801 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002803 return unicode_aswidechar(unicode, w, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002804}
2805
Victor Stinner137c34c2010-09-29 10:25:54 +00002806wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00002807PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00002808 Py_ssize_t *size)
2809{
2810 wchar_t* buffer;
2811 Py_ssize_t buflen;
2812
2813 if (unicode == NULL) {
2814 PyErr_BadInternalCall();
2815 return NULL;
2816 }
2817
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002818 buflen = unicode_aswidechar(unicode, NULL, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002819 if (buflen == -1)
2820 return NULL;
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002821 buffer = PyMem_NEW(wchar_t, buflen);
Victor Stinner137c34c2010-09-29 10:25:54 +00002822 if (buffer == NULL) {
2823 PyErr_NoMemory();
2824 return NULL;
2825 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002826 buflen = unicode_aswidechar(unicode, buffer, buflen);
Stefan Krah8528c312012-08-19 21:52:43 +02002827 if (buflen == -1) {
2828 PyMem_FREE(buffer);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002829 return NULL;
Stefan Krah8528c312012-08-19 21:52:43 +02002830 }
Victor Stinner5593d8a2010-10-02 11:11:27 +00002831 if (size != NULL)
2832 *size = buflen;
Victor Stinner137c34c2010-09-29 10:25:54 +00002833 return buffer;
2834}
2835
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002836#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837
Alexander Belopolsky40018472011-02-26 01:02:56 +00002838PyObject *
2839PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002840{
Victor Stinner8faf8212011-12-08 22:14:11 +01002841 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002842 PyErr_SetString(PyExc_ValueError,
2843 "chr() arg not in range(0x110000)");
2844 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002845 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00002846
Victor Stinner985a82a2014-01-03 12:53:47 +01002847 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00002848}
2849
Alexander Belopolsky40018472011-02-26 01:02:56 +00002850PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002851PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002853 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00002854 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002855 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05002856 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02002857 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00002858 Py_INCREF(obj);
2859 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002860 }
2861 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002862 /* For a Unicode subtype that's not a Unicode object,
2863 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002864 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002865 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00002866 PyErr_Format(PyExc_TypeError,
2867 "Can't convert '%.100s' object to str implicitly",
Christian Heimes90aa7642007-12-19 02:45:37 +00002868 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00002869 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002870}
2871
Alexander Belopolsky40018472011-02-26 01:02:56 +00002872PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02002873PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002874 const char *encoding,
2875 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002876{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002877 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002878 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00002879
Guido van Rossumd57fd912000-03-10 22:53:23 +00002880 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002881 PyErr_BadInternalCall();
2882 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002883 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002884
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002885 /* Decoding bytes objects is the most common case and should be fast */
2886 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002887 if (PyBytes_GET_SIZE(obj) == 0)
2888 _Py_RETURN_UNICODE_EMPTY();
2889 v = PyUnicode_Decode(
2890 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2891 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002892 return v;
2893 }
2894
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002895 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00002896 PyErr_SetString(PyExc_TypeError,
2897 "decoding str is not supported");
2898 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002899 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00002900
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002901 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2902 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2903 PyErr_Format(PyExc_TypeError,
Serhiy Storchakab757c832014-12-05 22:25:22 +02002904 "coercing to str: need a bytes-like object, %.80s found",
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002905 Py_TYPE(obj)->tp_name);
2906 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00002907 }
Tim Petersced69f82003-09-16 20:30:58 +00002908
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002909 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02002910 PyBuffer_Release(&buffer);
2911 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00002913
Serhiy Storchaka05997252013-01-26 12:14:02 +02002914 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00002915 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00002916 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917}
2918
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  The normalized, null-terminated name is written into
   lower, which has room for lower_len bytes.  A NULL encoding is treated
   as "utf-8".

   Return 0 on error (encoding is longer than lower_len-1, or lower_len is
   too small to hold even the terminator), 1 on success. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* With an empty buffer, lower_len - 1 would wrap around and the loop
       below would write past the buffer: not even '\0' fits. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last slot is reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2958
Alexander Belopolsky40018472011-02-26 01:02:56 +00002959PyObject *
2960PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03002961 Py_ssize_t size,
2962 const char *encoding,
2963 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00002964{
2965 PyObject *buffer = NULL, *unicode;
2966 Py_buffer info;
2967 char lower[11]; /* Enough for any encoding shortcut */
2968
Fred Drakee4315f52000-05-09 19:53:39 +00002969 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01002970 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002971 if ((strcmp(lower, "utf-8") == 0) ||
2972 (strcmp(lower, "utf8") == 0))
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002973 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
Victor Stinner37296e82010-06-10 13:36:23 +00002974 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00002975 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01002976 (strcmp(lower, "iso-8859-1") == 0) ||
2977 (strcmp(lower, "iso8859-1") == 0))
Victor Stinner37296e82010-06-10 13:36:23 +00002978 return PyUnicode_DecodeLatin1(s, size, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02002979#ifdef HAVE_MBCS
Victor Stinner37296e82010-06-10 13:36:23 +00002980 else if (strcmp(lower, "mbcs") == 0)
2981 return PyUnicode_DecodeMBCS(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002982#endif
Victor Stinner37296e82010-06-10 13:36:23 +00002983 else if (strcmp(lower, "ascii") == 0)
2984 return PyUnicode_DecodeASCII(s, size, errors);
2985 else if (strcmp(lower, "utf-16") == 0)
2986 return PyUnicode_DecodeUTF16(s, size, errors, 0);
2987 else if (strcmp(lower, "utf-32") == 0)
2988 return PyUnicode_DecodeUTF32(s, size, errors, 0);
2989 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990
2991 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002992 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00002993 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00002994 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00002995 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002996 if (buffer == NULL)
2997 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10002998 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002999 if (unicode == NULL)
3000 goto onError;
3001 if (!PyUnicode_Check(unicode)) {
3002 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003003 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3004 "use codecs.decode() to decode to arbitrary types",
3005 encoding,
3006 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007 Py_DECREF(unicode);
3008 goto onError;
3009 }
3010 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003011 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003012
Benjamin Peterson29060642009-01-31 22:14:21 +00003013 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014 Py_XDECREF(buffer);
3015 return NULL;
3016}
3017
Alexander Belopolsky40018472011-02-26 01:02:56 +00003018PyObject *
3019PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003020 const char *encoding,
3021 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003022{
3023 PyObject *v;
3024
3025 if (!PyUnicode_Check(unicode)) {
3026 PyErr_BadArgument();
3027 goto onError;
3028 }
3029
3030 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003031 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003032
3033 /* Decode via the codec registry */
3034 v = PyCodec_Decode(unicode, encoding, errors);
3035 if (v == NULL)
3036 goto onError;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003037 return unicode_result(v);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003038
Benjamin Peterson29060642009-01-31 22:14:21 +00003039 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003040 return NULL;
3041}
3042
Alexander Belopolsky40018472011-02-26 01:02:56 +00003043PyObject *
3044PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003045 const char *encoding,
3046 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003047{
3048 PyObject *v;
3049
3050 if (!PyUnicode_Check(unicode)) {
3051 PyErr_BadArgument();
3052 goto onError;
3053 }
3054
3055 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003056 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003057
3058 /* Decode via the codec registry */
3059 v = PyCodec_Decode(unicode, encoding, errors);
3060 if (v == NULL)
3061 goto onError;
3062 if (!PyUnicode_Check(v)) {
3063 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003064 "'%.400s' decoder returned '%.400s' instead of 'str'; "
3065 "use codecs.decode() to decode to arbitrary types",
3066 encoding,
3067 Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003068 Py_DECREF(v);
3069 goto onError;
3070 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003071 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003072
Benjamin Peterson29060642009-01-31 22:14:21 +00003073 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003074 return NULL;
3075}
3076
Alexander Belopolsky40018472011-02-26 01:02:56 +00003077PyObject *
3078PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003079 Py_ssize_t size,
3080 const char *encoding,
3081 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082{
3083 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003084
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 unicode = PyUnicode_FromUnicode(s, size);
3086 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003087 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3089 Py_DECREF(unicode);
3090 return v;
3091}
3092
Alexander Belopolsky40018472011-02-26 01:02:56 +00003093PyObject *
3094PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003095 const char *encoding,
3096 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003097{
3098 PyObject *v;
3099
3100 if (!PyUnicode_Check(unicode)) {
3101 PyErr_BadArgument();
3102 goto onError;
3103 }
3104
3105 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003106 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003107
3108 /* Encode via the codec registry */
3109 v = PyCodec_Encode(unicode, encoding, errors);
3110 if (v == NULL)
3111 goto onError;
3112 return v;
3113
Benjamin Peterson29060642009-01-31 22:14:21 +00003114 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003115 return NULL;
3116}
3117
/* Locate the first wide character of wstr that wcstombs() cannot encode.

   The string is fed to wcstombs() one character at a time (a surrogate
   pair counts as one character when wchar_t is 16 bits wide).  Return the
   index in wstr of the first character for which the conversion fails, or
   0 if no such character is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    /* surrogate pair + terminator */
    wchar_t chunk[3] = {0, 0, 0};
#else
    /* one character + terminator */
    wchar_t chunk[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    size_t pos = 0;

    while (wstr[pos] != L'\0') {
        /* Index of the character about to be tried. */
        size_t char_pos = pos;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(wstr[pos])
            && Py_UNICODE_IS_LOW_SURROGATE(wstr[pos + 1]))
        {
            chunk[0] = wstr[pos];
            chunk[1] = wstr[pos + 1];
            pos += 2;
        }
        else {
            chunk[0] = wstr[pos];
            chunk[1] = 0;
            pos += 1;
        }
#else
        chunk[0] = wstr[pos];
        pos += 1;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return char_pos;
    }

    /* failed to find the unencodable character */
    return 0;
}
3164
Victor Stinner1b579672011-12-17 05:47:23 +01003165static int
3166locale_error_handler(const char *errors, int *surrogateescape)
3167{
3168 if (errors == NULL) {
3169 *surrogateescape = 0;
3170 return 0;
3171 }
3172
3173 if (strcmp(errors, "strict") == 0) {
3174 *surrogateescape = 0;
3175 return 0;
3176 }
Victor Stinner8dbd4212012-12-04 09:30:24 +01003177 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner1b579672011-12-17 05:47:23 +01003178 *surrogateescape = 1;
3179 return 0;
3180 }
3181 PyErr_Format(PyExc_ValueError,
3182 "only 'strict' and 'surrogateescape' error handlers "
3183 "are supported, not '%s'",
3184 errors);
3185 return -1;
3186}
3187
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003188PyObject *
Victor Stinner1b579672011-12-17 05:47:23 +01003189PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003190{
3191 Py_ssize_t wlen, wlen2;
3192 wchar_t *wstr;
3193 PyObject *bytes = NULL;
3194 char *errmsg;
Raymond Hettingere56666d2013-08-04 11:51:03 -07003195 PyObject *reason = NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003196 PyObject *exc;
3197 size_t error_pos;
Victor Stinner1b579672011-12-17 05:47:23 +01003198 int surrogateescape;
3199
3200 if (locale_error_handler(errors, &surrogateescape) < 0)
3201 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003202
3203 wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3204 if (wstr == NULL)
3205 return NULL;
3206
3207 wlen2 = wcslen(wstr);
3208 if (wlen2 != wlen) {
3209 PyMem_Free(wstr);
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003210 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003211 return NULL;
3212 }
3213
3214 if (surrogateescape) {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003215 /* "surrogateescape" error handler */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003216 char *str;
3217
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003218 str = Py_EncodeLocale(wstr, &error_pos);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003219 if (str == NULL) {
3220 if (error_pos == (size_t)-1) {
3221 PyErr_NoMemory();
3222 PyMem_Free(wstr);
3223 return NULL;
3224 }
3225 else {
3226 goto encode_error;
3227 }
3228 }
3229 PyMem_Free(wstr);
3230
3231 bytes = PyBytes_FromString(str);
3232 PyMem_Free(str);
3233 }
3234 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003235 /* strict mode */
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003236 size_t len, len2;
3237
3238 len = wcstombs(NULL, wstr, 0);
3239 if (len == (size_t)-1) {
Victor Stinner2f197072011-12-17 07:08:30 +01003240 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003241 goto encode_error;
3242 }
3243
3244 bytes = PyBytes_FromStringAndSize(NULL, len);
3245 if (bytes == NULL) {
3246 PyMem_Free(wstr);
3247 return NULL;
3248 }
3249
3250 len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3251 if (len2 == (size_t)-1 || len2 > len) {
Victor Stinner2f197072011-12-17 07:08:30 +01003252 error_pos = (size_t)-1;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003253 goto encode_error;
3254 }
3255 PyMem_Free(wstr);
3256 }
3257 return bytes;
3258
3259encode_error:
3260 errmsg = strerror(errno);
3261 assert(errmsg != NULL);
Victor Stinner2f197072011-12-17 07:08:30 +01003262
3263 if (error_pos == (size_t)-1)
3264 error_pos = wcstombs_errorpos(wstr);
3265
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003266 PyMem_Free(wstr);
3267 Py_XDECREF(bytes);
3268
Victor Stinner2f197072011-12-17 07:08:30 +01003269 if (errmsg != NULL) {
3270 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003271 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003272 if (wstr != NULL) {
3273 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003274 PyMem_RawFree(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003275 } else
3276 errmsg = NULL;
3277 }
3278 if (errmsg == NULL)
Victor Stinner1f33f2b2011-12-17 04:45:09 +01003279 reason = PyUnicode_FromString(
3280 "wcstombs() encountered an unencodable "
3281 "wide character");
3282 if (reason == NULL)
3283 return NULL;
3284
3285 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3286 "locale", unicode,
3287 (Py_ssize_t)error_pos,
3288 (Py_ssize_t)(error_pos+1),
3289 reason);
3290 Py_DECREF(reason);
3291 if (exc != NULL) {
3292 PyCodec_StrictErrors(exc);
3293 Py_XDECREF(exc);
3294 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003295 return NULL;
3296}
3297
Victor Stinnerad158722010-10-27 00:25:46 +00003298PyObject *
3299PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003300{
Victor Stinner99b95382011-07-04 14:23:54 +02003301#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003302 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003303#elif defined(__APPLE__)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003304 return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
Victor Stinnerad158722010-10-27 00:25:46 +00003305#else
Victor Stinner793b5312011-04-27 00:24:21 +02003306 PyInterpreterState *interp = PyThreadState_GET()->interp;
3307 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3308 cannot use it to encode and decode filenames before it is loaded. Load
3309 the Python codec requires to encode at least its own filename. Use the C
3310 version of the locale codec until the codec registry is initialized and
3311 the Python codec is loaded.
3312
3313 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3314 cannot only rely on it: check also interp->fscodec_initialized for
3315 subinterpreters. */
3316 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003317 return PyUnicode_AsEncodedString(unicode,
3318 Py_FileSystemDefaultEncoding,
3319 "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003320 }
3321 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003322 return PyUnicode_EncodeLocale(unicode, "surrogateescape");
Victor Stinnerc39211f2010-09-29 16:35:47 +00003323 }
Victor Stinnerad158722010-10-27 00:25:46 +00003324#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003325}
3326
Alexander Belopolsky40018472011-02-26 01:02:56 +00003327PyObject *
3328PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003329 const char *encoding,
3330 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003331{
3332 PyObject *v;
Victor Stinner600d3be2010-06-10 12:00:55 +00003333 char lower[11]; /* Enough for any encoding shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003334
Guido van Rossumd57fd912000-03-10 22:53:23 +00003335 if (!PyUnicode_Check(unicode)) {
3336 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003337 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 }
Fred Drakee4315f52000-05-09 19:53:39 +00003339
Fred Drakee4315f52000-05-09 19:53:39 +00003340 /* Shortcuts for common default encodings */
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003341 if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003342 if ((strcmp(lower, "utf-8") == 0) ||
3343 (strcmp(lower, "utf8") == 0))
Victor Stinnera5c68c32011-03-02 01:03:14 +00003344 {
Victor Stinner2f283c22011-03-02 01:21:46 +00003345 if (errors == NULL || strcmp(errors, "strict") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003346 return _PyUnicode_AsUTF8String(unicode, NULL);
Victor Stinner2f283c22011-03-02 01:21:46 +00003347 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003348 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinnera5c68c32011-03-02 01:03:14 +00003349 }
Victor Stinner37296e82010-06-10 13:36:23 +00003350 else if ((strcmp(lower, "latin-1") == 0) ||
Alexander Belopolsky1d521462011-02-25 19:19:57 +00003351 (strcmp(lower, "latin1") == 0) ||
Victor Stinnerfa3ba4c2013-10-29 11:34:05 +01003352 (strcmp(lower, "iso-8859-1") == 0) ||
3353 (strcmp(lower, "iso8859-1") == 0))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003354 return _PyUnicode_AsLatin1String(unicode, errors);
Victor Stinner99b95382011-07-04 14:23:54 +02003355#ifdef HAVE_MBCS
Victor Stinnerac931b12011-11-20 18:27:03 +01003356 else if (strcmp(lower, "mbcs") == 0)
3357 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003358#endif
Victor Stinner37296e82010-06-10 13:36:23 +00003359 else if (strcmp(lower, "ascii") == 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003360 return _PyUnicode_AsASCIIString(unicode, errors);
Victor Stinner37296e82010-06-10 13:36:23 +00003361 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362
3363 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003364 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003366 return NULL;
3367
3368 /* The normal path */
3369 if (PyBytes_Check(v))
3370 return v;
3371
3372 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003373 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003374 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003375 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003376
3377 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003378 "encoder %s returned bytearray instead of bytes; "
3379 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003380 encoding);
3381 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003382 Py_DECREF(v);
3383 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003384 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003385
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003386 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3387 Py_DECREF(v);
3388 return b;
3389 }
3390
3391 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003392 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3393 "use codecs.encode() to encode to arbitrary types",
3394 encoding,
3395 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003396 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003397 return NULL;
3398}
3399
Alexander Belopolsky40018472011-02-26 01:02:56 +00003400PyObject *
3401PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003402 const char *encoding,
3403 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003404{
3405 PyObject *v;
3406
3407 if (!PyUnicode_Check(unicode)) {
3408 PyErr_BadArgument();
3409 goto onError;
3410 }
3411
3412 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003413 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003414
3415 /* Encode via the codec registry */
3416 v = PyCodec_Encode(unicode, encoding, errors);
3417 if (v == NULL)
3418 goto onError;
3419 if (!PyUnicode_Check(v)) {
3420 PyErr_Format(PyExc_TypeError,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003421 "'%.400s' encoder returned '%.400s' instead of 'str'; "
3422 "use codecs.encode() to encode to arbitrary types",
3423 encoding,
3424 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003425 Py_DECREF(v);
3426 goto onError;
3427 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003428 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003429
Benjamin Peterson29060642009-01-31 22:14:21 +00003430 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003431 return NULL;
3432}
3433
/* Locate the first byte sequence of str (len bytes) that cannot be decoded
   with the current locale.

   When mbrtowc() is available, the string is decoded one character at a
   time and the byte offset of the first invalid or incomplete sequence is
   returned.  Return 0 if no such sequence is found, or if mbrtowc() is not
   available. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0)
            /* Reached end of string */
            break;
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3464
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003465PyObject*
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003466PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
Victor Stinner1b579672011-12-17 05:47:23 +01003467 const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003468{
3469 wchar_t smallbuf[256];
3470 size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3471 wchar_t *wstr;
3472 size_t wlen, wlen2;
3473 PyObject *unicode;
Victor Stinner1b579672011-12-17 05:47:23 +01003474 int surrogateescape;
Victor Stinner2f197072011-12-17 07:08:30 +01003475 size_t error_pos;
3476 char *errmsg;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01003477 PyObject *reason = NULL; /* initialize to prevent gcc warning */
3478 PyObject *exc;
Victor Stinner1b579672011-12-17 05:47:23 +01003479
3480 if (locale_error_handler(errors, &surrogateescape) < 0)
3481 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003482
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003483 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3484 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003485 return NULL;
3486 }
3487
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003488 if (surrogateescape) {
3489 /* "surrogateescape" error handler */
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003490 wstr = Py_DecodeLocale(str, &wlen);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003491 if (wstr == NULL) {
3492 if (wlen == (size_t)-1)
3493 PyErr_NoMemory();
3494 else
3495 PyErr_SetFromErrno(PyExc_OSError);
3496 return NULL;
3497 }
3498
3499 unicode = PyUnicode_FromWideChar(wstr, wlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003500 PyMem_RawFree(wstr);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003501 }
3502 else {
Victor Stinnerd45c7f82012-12-04 01:34:47 +01003503 /* strict mode */
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003504#ifndef HAVE_BROKEN_MBSTOWCS
3505 wlen = mbstowcs(NULL, str, 0);
3506#else
3507 wlen = len;
3508#endif
Victor Stinner2f197072011-12-17 07:08:30 +01003509 if (wlen == (size_t)-1)
3510 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003511 if (wlen+1 <= smallbuf_len) {
3512 wstr = smallbuf;
3513 }
3514 else {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02003515 wstr = PyMem_New(wchar_t, wlen+1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003516 if (!wstr)
3517 return PyErr_NoMemory();
3518 }
3519
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003520 wlen2 = mbstowcs(wstr, str, wlen+1);
3521 if (wlen2 == (size_t)-1) {
3522 if (wstr != smallbuf)
3523 PyMem_Free(wstr);
Victor Stinner2f197072011-12-17 07:08:30 +01003524 goto decode_error;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003525 }
3526#ifdef HAVE_BROKEN_MBSTOWCS
3527 assert(wlen2 == wlen);
3528#endif
3529 unicode = PyUnicode_FromWideChar(wstr, wlen2);
3530 if (wstr != smallbuf)
3531 PyMem_Free(wstr);
3532 }
3533 return unicode;
Victor Stinner2f197072011-12-17 07:08:30 +01003534
3535decode_error:
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003536 reason = NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003537 errmsg = strerror(errno);
3538 assert(errmsg != NULL);
3539
3540 error_pos = mbstowcs_errorpos(str, len);
3541 if (errmsg != NULL) {
3542 size_t errlen;
Victor Stinnerf6a271a2014-08-01 12:28:48 +02003543 wstr = Py_DecodeLocale(errmsg, &errlen);
Victor Stinner2f197072011-12-17 07:08:30 +01003544 if (wstr != NULL) {
3545 reason = PyUnicode_FromWideChar(wstr, errlen);
Victor Stinner1a7425f2013-07-07 16:25:15 +02003546 PyMem_RawFree(wstr);
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003547 }
Victor Stinner2f197072011-12-17 07:08:30 +01003548 }
Antoine Pitrouf6d1f1f2015-05-19 21:04:33 +02003549 if (reason == NULL)
Victor Stinner2f197072011-12-17 07:08:30 +01003550 reason = PyUnicode_FromString(
3551 "mbstowcs() encountered an invalid multibyte sequence");
3552 if (reason == NULL)
3553 return NULL;
3554
3555 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3556 "locale", str, len,
3557 (Py_ssize_t)error_pos,
3558 (Py_ssize_t)(error_pos+1),
3559 reason);
3560 Py_DECREF(reason);
3561 if (exc != NULL) {
3562 PyCodec_StrictErrors(exc);
3563 Py_XDECREF(exc);
3564 }
3565 return NULL;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003566}
3567
3568PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003569PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003570{
3571 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner1b579672011-12-17 05:47:23 +01003572 return PyUnicode_DecodeLocaleAndSize(str, size, errors);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003573}
3574
3575
3576PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003577PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003578 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003579 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3580}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003581
Christian Heimes5894ba72007-11-04 11:43:14 +00003582PyObject*
3583PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3584{
Victor Stinner99b95382011-07-04 14:23:54 +02003585#ifdef HAVE_MBCS
Victor Stinnerad158722010-10-27 00:25:46 +00003586 return PyUnicode_DecodeMBCS(s, size, NULL);
3587#elif defined(__APPLE__)
Victor Stinnera1d12bb2011-12-11 21:53:09 +01003588 return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
Victor Stinnerad158722010-10-27 00:25:46 +00003589#else
Victor Stinner793b5312011-04-27 00:24:21 +02003590 PyInterpreterState *interp = PyThreadState_GET()->interp;
3591 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3592 cannot use it to encode and decode filenames before it is loaded. Load
3593 the Python codec requires to encode at least its own filename. Use the C
3594 version of the locale codec until the codec registry is initialized and
3595 the Python codec is loaded.
3596
3597 Py_FileSystemDefaultEncoding is shared between all interpreters, we
3598 cannot only rely on it: check also interp->fscodec_initialized for
3599 subinterpreters. */
3600 if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003601 return PyUnicode_Decode(s, size,
3602 Py_FileSystemDefaultEncoding,
Victor Stinnerb9a20ad2010-04-30 16:37:52 +00003603 "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003604 }
3605 else {
Victor Stinner1b579672011-12-17 05:47:23 +01003606 return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003607 }
Victor Stinnerad158722010-10-27 00:25:46 +00003608#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003609}
3610
Martin v. Löwis011e8422009-05-05 04:43:17 +00003611
3612int
3613PyUnicode_FSConverter(PyObject* arg, void* addr)
3614{
3615 PyObject *output = NULL;
3616 Py_ssize_t size;
3617 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003618 if (arg == NULL) {
3619 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003620 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003621 return 1;
3622 }
Victor Stinnerdcb24032010-04-22 12:08:36 +00003623 if (PyBytes_Check(arg)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00003624 output = arg;
3625 Py_INCREF(output);
3626 }
3627 else {
3628 arg = PyUnicode_FromObject(arg);
3629 if (!arg)
3630 return 0;
Victor Stinnerae6265f2010-05-15 16:27:27 +00003631 output = PyUnicode_EncodeFSDefault(arg);
Martin v. Löwis011e8422009-05-05 04:43:17 +00003632 Py_DECREF(arg);
3633 if (!output)
3634 return 0;
3635 if (!PyBytes_Check(output)) {
3636 Py_DECREF(output);
3637 PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3638 return 0;
3639 }
3640 }
Victor Stinner0ea2a462010-04-30 00:22:08 +00003641 size = PyBytes_GET_SIZE(output);
3642 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003643 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003644 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003645 Py_DECREF(output);
3646 return 0;
3647 }
3648 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003649 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003650}
3651
3652
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003653int
3654PyUnicode_FSDecoder(PyObject* arg, void* addr)
3655{
3656 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003657 if (arg == NULL) {
3658 Py_DECREF(*(PyObject**)addr);
3659 return 1;
3660 }
3661 if (PyUnicode_Check(arg)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003662 if (PyUnicode_READY(arg) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003663 return 0;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003664 output = arg;
3665 Py_INCREF(output);
3666 }
3667 else {
3668 arg = PyBytes_FromObject(arg);
3669 if (!arg)
3670 return 0;
3671 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3672 PyBytes_GET_SIZE(arg));
3673 Py_DECREF(arg);
3674 if (!output)
3675 return 0;
3676 if (!PyUnicode_Check(output)) {
3677 Py_DECREF(output);
3678 PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3679 return 0;
3680 }
3681 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003682 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003683 Py_DECREF(output);
3684 return 0;
3685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003686 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003687 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003688 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003689 Py_DECREF(output);
3690 return 0;
3691 }
3692 *(PyObject**)addr = output;
3693 return Py_CLEANUP_SUPPORTED;
3694}
3695
3696
Martin v. Löwis5b222132007-06-10 09:51:05 +00003697char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003698PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003699{
Christian Heimesf3863112007-11-22 07:46:41 +00003700 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003701
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003702 if (!PyUnicode_Check(unicode)) {
3703 PyErr_BadArgument();
3704 return NULL;
3705 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003706 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003707 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003708
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003709 if (PyUnicode_UTF8(unicode) == NULL) {
3710 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003711 bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3712 if (bytes == NULL)
3713 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003714 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3715 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003716 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003717 Py_DECREF(bytes);
3718 return NULL;
3719 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003720 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3721 Py_MEMCPY(_PyUnicode_UTF8(unicode),
3722 PyBytes_AS_STRING(bytes),
3723 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003724 Py_DECREF(bytes);
3725 }
3726
3727 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003728 *psize = PyUnicode_UTF8_LENGTH(unicode);
3729 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003730}
3731
3732char*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003733PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003734{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003735 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3736}
3737
/* Return the wchar_t buffer stored in _PyUnicode_WSTR(unicode), building it
   first when that slot is still NULL.  When `size` is not NULL it receives
   PyUnicode_WSTR_LENGTH(unicode).

   Returns NULL with an exception set when `unicode` is not a str object or
   when the buffer cannot be allocated.  Combinations of string kind and
   wchar_t width that the code considers impossible end in Py_FatalError(). */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003738Py_UNICODE *
3739PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3740{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003741 const unsigned char *one_byte;
/* Only the source pointers needed for the configured wchar_t width are
   declared. */
3742#if SIZEOF_WCHAR_T == 4
3743 const Py_UCS2 *two_bytes;
3744#else
3745 const Py_UCS4 *four_bytes;
3746 const Py_UCS4 *ucs4_end;
3747 Py_ssize_t num_surrogates;
3748#endif
3749 wchar_t *w;
3750 wchar_t *wchar_end;
3751
3752 if (!PyUnicode_Check(unicode)) {
3753 PyErr_BadArgument();
3754 return NULL;
3755 }
/* No wchar_t buffer yet: build one from the character data. */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003756 if (_PyUnicode_WSTR(unicode) == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003757 /* Non-ASCII compact unicode object */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003758 assert(_PyUnicode_KIND(unicode) != 0);
3759 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003760
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003761 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003762#if SIZEOF_WCHAR_T == 2
/* First pass: every code point above 0xFFFF needs one extra wchar_t,
   because it is written below as a surrogate pair. */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003763 four_bytes = PyUnicode_4BYTE_DATA(unicode);
3764 ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003765 num_surrogates = 0;
3766
3767 for (; four_bytes < ucs4_end; ++four_bytes) {
3768 if (*four_bytes > 0xFFFF)
3769 ++num_surrogates;
3770 }
3771
/* One slot per code point, one more per surrogate pair, plus the
   terminating 0. */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003772 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3773 sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3774 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003775 PyErr_NoMemory();
3776 return NULL;
3777 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003778 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003779
/* Second pass: copy the code points, splitting those above 0xFFFF. */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003780 w = _PyUnicode_WSTR(unicode);
3781 wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3782 four_bytes = PyUnicode_4BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003783 for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3784 if (*four_bytes > 0xFFFF) {
Victor Stinner8faf8212011-12-08 22:14:11 +01003785 assert(*four_bytes <= MAX_UNICODE);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003786 /* encode surrogate pair in this case */
Victor Stinner551ac952011-11-29 22:58:13 +01003787 *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3788 *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003789 }
3790 else
3791 *w = *four_bytes;
3792
/* Debug-build sanity check against the length computed in the first pass. */
3793 if (w > wchar_end) {
3794 assert(0 && "Miscalculated string end");
3795 }
3796 }
3797 *w = 0;
3798#else
3799 /* sizeof(wchar_t) == 4 */
3800 Py_FatalError("Impossible unicode object state, wstr and str "
3801 "should share memory already.");
3802 return NULL;
3803#endif
3804 }
3805 else {
/* 1-byte and 2-byte kinds: exactly one wchar_t per character.  Refuse
   lengths for which the byte count computed below would not fit. */
Serhiy Storchakae55181f2015-02-20 21:34:06 +02003806 if ((size_t)_PyUnicode_LENGTH(unicode) >
3807 PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3808 PyErr_NoMemory();
3809 return NULL;
3810 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003811 _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3812 (_PyUnicode_LENGTH(unicode) + 1));
3813 if (!_PyUnicode_WSTR(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003814 PyErr_NoMemory();
3815 return NULL;
3816 }
/* The wstr length is stored only for objects that are not compact ASCII. */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003817 if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3818 _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3819 w = _PyUnicode_WSTR(unicode);
3820 wchar_end = w + _PyUnicode_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003821
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003822 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3823 one_byte = PyUnicode_1BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003824 for (; w < wchar_end; ++one_byte, ++w)
3825 *w = *one_byte;
3826 /* null-terminate the wstr */
3827 *w = 0;
3828 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003829 else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003830#if SIZEOF_WCHAR_T == 4
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003831 two_bytes = PyUnicode_2BYTE_DATA(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003832 for (; w < wchar_end; ++two_bytes, ++w)
3833 *w = *two_bytes;
3834 /* null-terminate the wstr */
3835 *w = 0;
3836#else
3837 /* sizeof(wchar_t) == 2 */
/* Release the buffer allocated above before reporting the fatal error. */
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003838 PyObject_FREE(_PyUnicode_WSTR(unicode));
3839 _PyUnicode_WSTR(unicode) = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003840 Py_FatalError("Impossible unicode object state, wstr "
3841 "and str should share memory already.");
3842 return NULL;
3843#endif
3844 }
3845 else {
3846 assert(0 && "This should never happen.");
3847 }
3848 }
3849 }
/* Report the length only when the caller asked for it. */
3850 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003851 *size = PyUnicode_WSTR_LENGTH(unicode);
3852 return _PyUnicode_WSTR(unicode);
Martin v. Löwis5b222132007-06-10 09:51:05 +00003853}
3854
Alexander Belopolsky40018472011-02-26 01:02:56 +00003855Py_UNICODE *
3856PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003858 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003859}
3860
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003861
Alexander Belopolsky40018472011-02-26 01:02:56 +00003862Py_ssize_t
3863PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864{
3865 if (!PyUnicode_Check(unicode)) {
3866 PyErr_BadArgument();
3867 goto onError;
3868 }
3869 return PyUnicode_GET_SIZE(unicode);
3870
Benjamin Peterson29060642009-01-31 22:14:21 +00003871 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872 return -1;
3873}
3874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003875Py_ssize_t
3876PyUnicode_GetLength(PyObject *unicode)
3877{
Victor Stinner07621332012-06-16 04:53:46 +02003878 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003879 PyErr_BadArgument();
3880 return -1;
3881 }
Victor Stinner07621332012-06-16 04:53:46 +02003882 if (PyUnicode_READY(unicode) == -1)
3883 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003884 return PyUnicode_GET_LENGTH(unicode);
3885}
3886
3887Py_UCS4
3888PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3889{
Victor Stinner69ed0f42013-04-09 21:48:24 +02003890 void *data;
3891 int kind;
3892
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003893 if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3894 PyErr_BadArgument();
3895 return (Py_UCS4)-1;
3896 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01003897 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02003898 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003899 return (Py_UCS4)-1;
3900 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02003901 data = PyUnicode_DATA(unicode);
3902 kind = PyUnicode_KIND(unicode);
3903 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003904}
3905
3906int
3907PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3908{
3909 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003910 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003911 return -1;
3912 }
Victor Stinner488fa492011-12-12 00:01:39 +01003913 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01003914 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02003915 PyErr_SetString(PyExc_IndexError, "string index out of range");
3916 return -1;
3917 }
Victor Stinner488fa492011-12-12 00:01:39 +01003918 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02003919 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01003920 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3921 PyErr_SetString(PyExc_ValueError, "character out of range");
3922 return -1;
3923 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003924 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3925 index, ch);
3926 return 0;
3927}
3928
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3934
Victor Stinner554f3f02010-06-16 23:33:54 +00003935/* create or adjust a UnicodeDecodeError */
3936static void
3937make_decode_exception(PyObject **exceptionObject,
3938 const char *encoding,
3939 const char *input, Py_ssize_t length,
3940 Py_ssize_t startpos, Py_ssize_t endpos,
3941 const char *reason)
3942{
3943 if (*exceptionObject == NULL) {
3944 *exceptionObject = PyUnicodeDecodeError_Create(
3945 encoding, input, length, startpos, endpos, reason);
3946 }
3947 else {
3948 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3949 goto onError;
3950 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3951 goto onError;
3952 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3953 goto onError;
3954 }
3955 return;
3956
3957onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02003958 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00003959}
3960
#ifdef HAVE_MBCS
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors, errorHandler: name of the error handler and a cache for the
       looked-up callable (filled in on first use).
   encoding, reason: values used to build or update *exceptionObject.
   input, inend: in/out; re-pointed at the bytes object held by the
       exception after the callback has run.
   startinpos, endinpos: error range; *endinpos is set to the position
       returned by the callback.
   inptr: out; set to *input + the new position.
   output, outpos: wchar_t-kind result string (resized when needed) and the
       current write position in it.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        /* &argparse[4] skips the "O!n;" format prefix and leaves only the
           error message. */
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the bytes macros below must not be applied to a
           non-bytes object, and the error must be reported. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when doubling cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4076
4077static int
4078unicode_decode_call_errorhandler_writer(
4079 const char *errors, PyObject **errorHandler,
4080 const char *encoding, const char *reason,
4081 const char **input, const char **inend, Py_ssize_t *startinpos,
4082 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4083 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4084{
4085 static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4086
4087 PyObject *restuple = NULL;
4088 PyObject *repunicode = NULL;
4089 Py_ssize_t insize;
4090 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004091 Py_ssize_t replen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004092 PyObject *inputobj = NULL;
4093
4094 if (*errorHandler == NULL) {
4095 *errorHandler = PyCodec_LookupError(errors);
4096 if (*errorHandler == NULL)
4097 goto onError;
4098 }
4099
4100 make_decode_exception(exceptionObject,
4101 encoding,
4102 *input, *inend - *input,
4103 *startinpos, *endinpos,
4104 reason);
4105 if (*exceptionObject == NULL)
4106 goto onError;
4107
4108 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4109 if (restuple == NULL)
4110 goto onError;
4111 if (!PyTuple_Check(restuple)) {
4112 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4113 goto onError;
4114 }
4115 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004116 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004117
4118 /* Copy back the bytes variables, which might have been modified by the
4119 callback */
4120 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4121 if (!inputobj)
4122 goto onError;
Christian Heimes72b710a2008-05-26 13:28:38 +00004123 if (!PyBytes_Check(inputobj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004124 PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
Walter Dörwalde78178e2007-07-30 13:31:40 +00004125 }
Christian Heimes72b710a2008-05-26 13:28:38 +00004126 *input = PyBytes_AS_STRING(inputobj);
4127 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004128 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004129 /* we can DECREF safely, as the exception has another reference,
4130 so the object won't go away. */
4131 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004132
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004133 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004134 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004135 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004136 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004137 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004138 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004139
Victor Stinner8f674cc2013-04-17 23:02:17 +02004140 if (PyUnicode_READY(repunicode) < 0)
4141 goto onError;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004142 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004143 if (replen > 1) {
4144 writer->min_length += replen - 1;
Victor Stinner8f674cc2013-04-17 23:02:17 +02004145 writer->overallocate = 1;
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004146 if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4147 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4148 goto onError;
4149 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004150 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004151 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004152
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004153 *endinpos = newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004154 *inptr = *input + newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004155
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004156 /* we made it! */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004157 Py_XDECREF(restuple);
4158 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004159
Benjamin Peterson29060642009-01-31 22:14:21 +00004160 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004161 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004162 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004163}
4164
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004165/* --- UTF-7 Codec -------------------------------------------------------- */
4166
Antoine Pitrou244651a2009-05-04 18:56:13 +00004167/* See RFC2152 for details. We encode conservatively and decode liberally. */
4168
4169/* Three simple macros defining base-64. */
4170
4171/* Is c a base-64 character? */
4172
4173#define IS_BASE64(c) \
4174 (((c) >= 'A' && (c) <= 'Z') || \
4175 ((c) >= 'a' && (c) <= 'z') || \
4176 ((c) >= '0' && (c) <= '9') || \
4177 (c) == '+' || (c) == '/')
4178
4179/* given that c is a base-64 character, what is its base-64 value? */
4180
4181#define FROM_BASE64(c) \
4182 (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
4183 ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
4184 ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
4185 (c) == '+' ? 62 : 63)
4186
4187/* What is the base-64 character of the bottom 6 bits of n? */
4188
4189#define TO_BASE64(n) \
4190 ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4191
4192/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
4193 * decoded as itself. We are permissive on decoding; the only ASCII
4194 * byte not decoding to itself is the + which begins a base64
4195 * string. */
4196
4197#define DECODE_DIRECT(c) \
4198 ((c) <= 127 && (c) != '+')
4199
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This table maps each ASCII code to its set:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read (through ENCODE_DIRECT), so it is declared
 * const: accidental writes are rejected at compile time.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4233
/* ENCODE_DIRECT: true when character c should be emitted as itself
 * rather than base64-encoded.  Set D characters always are; Set O
 * characters only when directO is true, and whitespace only when
 * directWS is true.  RFC2152 makes it clear that the right choice for
 * these two sets varies between applications, hence the two flags.
 * NUL and anything >= 128 are never direct.
 *
 * The flag arguments are parenthesized in the expansion so that a
 * compound expression (e.g. "a || b") keeps its meaning next to the
 * "&&" that follows it. */

#define ENCODE_DIRECT(c, directO, directWS)                 \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004245
Alexander Belopolsky40018472011-02-26 01:02:56 +00004246PyObject *
4247PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004248 Py_ssize_t size,
4249 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004250{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004251 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4252}
4253
Antoine Pitrou244651a2009-05-04 18:56:13 +00004254/* The decoder. The only state we preserve is our read position,
4255 * i.e. how many characters we have consumed. So if we end in the
4256 * middle of a shift sequence we have to back off the read position
4257 * and the output to the beginning of the sequence, otherwise we lose
4258 * all the shift state (seen bits, number of bits seen, high
4259 * surrogate). */
4260
Alexander Belopolsky40018472011-02-26 01:02:56 +00004261PyObject *
4262PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004263 Py_ssize_t size,
4264 const char *errors,
4265 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004266{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004267 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004268 Py_ssize_t startinpos;
4269 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004270 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004271 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004272 const char *errmsg = "";
4273 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004274 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004275 unsigned int base64bits = 0;
4276 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004277 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004278 PyObject *errorHandler = NULL;
4279 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004280
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004281 if (size == 0) {
4282 if (consumed)
4283 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004284 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004285 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004286
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004287 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004288 _PyUnicodeWriter_Init(&writer);
4289 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004290
4291 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004292 e = s + size;
4293
4294 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004295 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004296 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004297 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004298
Antoine Pitrou244651a2009-05-04 18:56:13 +00004299 if (inShift) { /* in a base-64 section */
4300 if (IS_BASE64(ch)) { /* consume a base-64 character */
4301 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4302 base64bits += 6;
4303 s++;
4304 if (base64bits >= 16) {
4305 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004306 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004307 base64bits -= 16;
4308 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004309 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004310 if (surrogate) {
4311 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004312 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4313 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004314 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004315 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004316 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004317 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004318 }
4319 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004320 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004321 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004322 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004323 }
4324 }
Victor Stinner551ac952011-11-29 22:58:13 +01004325 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004326 /* first surrogate */
4327 surrogate = outCh;
4328 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004329 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004330 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004331 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004332 }
4333 }
4334 }
4335 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004336 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004337 if (base64bits > 0) { /* left-over bits */
4338 if (base64bits >= 6) {
4339 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004340 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004341 errmsg = "partial character in shift sequence";
4342 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004343 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004344 else {
4345 /* Some bits remain; they should be zero */
4346 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004347 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004348 errmsg = "non-zero padding bits in shift sequence";
4349 goto utf7Error;
4350 }
4351 }
4352 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004353 if (surrogate && DECODE_DIRECT(ch)) {
4354 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4355 goto onError;
4356 }
4357 surrogate = 0;
4358 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359 /* '-' is absorbed; other terminating
4360 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004361 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004362 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004363 }
4364 }
4365 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004367 s++; /* consume '+' */
4368 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004369 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004370 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004371 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004372 }
4373 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004374 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004375 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004376 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004377 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004378 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004379 }
4380 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004381 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004382 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004383 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004384 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004385 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004386 else {
4387 startinpos = s-starts;
4388 s++;
4389 errmsg = "unexpected special character";
4390 goto utf7Error;
4391 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004392 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004393utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004394 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004395 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004396 errors, &errorHandler,
4397 "utf7", errmsg,
4398 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004399 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004400 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004401 }
4402
Antoine Pitrou244651a2009-05-04 18:56:13 +00004403 /* end of string */
4404
4405 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4406 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004407 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004408 if (surrogate ||
4409 (base64bits >= 6) ||
4410 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004411 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004412 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004413 errors, &errorHandler,
4414 "utf7", "unterminated shift sequence",
4415 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004416 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004417 goto onError;
4418 if (s < e)
4419 goto restart;
4420 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004421 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004422
4423 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004424 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004425 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004426 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004427 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004428 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004429 writer.kind, writer.data, shiftOutStart);
4430 Py_XDECREF(errorHandler);
4431 Py_XDECREF(exc);
4432 _PyUnicodeWriter_Dealloc(&writer);
4433 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004434 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004435 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004436 }
4437 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004438 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004439 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004440 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004441
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004442 Py_XDECREF(errorHandler);
4443 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004444 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004445
Benjamin Peterson29060642009-01-31 22:14:21 +00004446 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004447 Py_XDECREF(errorHandler);
4448 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004449 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004450 return NULL;
4451}
4452
4453
Alexander Belopolsky40018472011-02-26 01:02:56 +00004454PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004455_PyUnicode_EncodeUTF7(PyObject *str,
4456 int base64SetO,
4457 int base64WhiteSpace,
4458 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004459{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004460 int kind;
4461 void *data;
4462 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004463 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004464 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004465 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004466 unsigned int base64bits = 0;
4467 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004468 char * out;
4469 char * start;
4470
Benjamin Petersonbac79492012-01-14 13:34:47 -05004471 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004472 return NULL;
4473 kind = PyUnicode_KIND(str);
4474 data = PyUnicode_DATA(str);
4475 len = PyUnicode_GET_LENGTH(str);
4476
4477 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004478 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004479
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004480 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004481 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004482 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004483 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484 if (v == NULL)
4485 return NULL;
4486
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004487 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004488 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004489 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004490
Antoine Pitrou244651a2009-05-04 18:56:13 +00004491 if (inShift) {
4492 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4493 /* shifting out */
4494 if (base64bits) { /* output remaining bits */
4495 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4496 base64buffer = 0;
4497 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004498 }
4499 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004500 /* Characters not in the BASE64 set implicitly unshift the sequence
4501 so no '-' is required, except if the character is itself a '-' */
4502 if (IS_BASE64(ch) || ch == '-') {
4503 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004504 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004505 *out++ = (char) ch;
4506 }
4507 else {
4508 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004509 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004510 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004511 else { /* not in a shift sequence */
4512 if (ch == '+') {
4513 *out++ = '+';
4514 *out++ = '-';
4515 }
4516 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4517 *out++ = (char) ch;
4518 }
4519 else {
4520 *out++ = '+';
4521 inShift = 1;
4522 goto encode_char;
4523 }
4524 }
4525 continue;
4526encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004527 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004528 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004529
Antoine Pitrou244651a2009-05-04 18:56:13 +00004530 /* code first surrogate */
4531 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004532 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004533 while (base64bits >= 6) {
4534 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4535 base64bits -= 6;
4536 }
4537 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004538 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004540 base64bits += 16;
4541 base64buffer = (base64buffer << 16) | ch;
4542 while (base64bits >= 6) {
4543 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4544 base64bits -= 6;
4545 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004546 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004547 if (base64bits)
4548 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4549 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004550 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004551 if (_PyBytes_Resize(&v, out - start) < 0)
4552 return NULL;
4553 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004555PyObject *
4556PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4557 Py_ssize_t size,
4558 int base64SetO,
4559 int base64WhiteSpace,
4560 const char *errors)
4561{
4562 PyObject *result;
4563 PyObject *tmp = PyUnicode_FromUnicode(s, size);
4564 if (tmp == NULL)
4565 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004566 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004567 base64WhiteSpace, errors);
4568 Py_DECREF(tmp);
4569 return result;
4570}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004571
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572#undef IS_BASE64
4573#undef FROM_BASE64
4574#undef TO_BASE64
4575#undef DECODE_DIRECT
4576#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004577
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578/* --- UTF-8 Codec -------------------------------------------------------- */
4579
Alexander Belopolsky40018472011-02-26 01:02:56 +00004580PyObject *
4581PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004582 Py_ssize_t size,
4583 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584{
Walter Dörwald69652032004-09-07 20:24:22 +00004585 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4586}
4587
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004588#include "stringlib/asciilib.h"
4589#include "stringlib/codecs.h"
4590#include "stringlib/undef.h"
4591
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004592#include "stringlib/ucs1lib.h"
4593#include "stringlib/codecs.h"
4594#include "stringlib/undef.h"
4595
4596#include "stringlib/ucs2lib.h"
4597#include "stringlib/codecs.h"
4598#include "stringlib/undef.h"
4599
4600#include "stringlib/ucs4lib.h"
4601#include "stringlib/codecs.h"
4602#include "stringlib/undef.h"
4603
Antoine Pitrouab868312009-01-10 15:40:25 +00004604/* Mask to quickly check whether a C 'long' contains a
4605 non-ASCII, UTF8-encoded char. */
4606#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004607# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004608#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004609# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004610#else
4611# error C 'long' size should be either 4 or 8!
4612#endif
4613
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004614static Py_ssize_t
4615ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004616{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004617 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004618 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004619
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004620 /*
4621 * Issue #17237: m68k is a bit different from most architectures in
4622 * that objects do not use "natural alignment" - for example, int and
4623 * long are only aligned at 2-byte boundaries. Therefore the assert()
4624 * won't work; also, tests have shown that skipping the "optimised
4625 * version" will even speed up m68k.
4626 */
4627#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004628#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004629 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4630 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004631 /* Fast path, see in STRINGLIB(utf8_decode) for
4632 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004633 /* Help allocation */
4634 const char *_p = p;
4635 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004636 while (_p < aligned_end) {
4637 unsigned long value = *(const unsigned long *) _p;
4638 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004639 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004640 *((unsigned long *)q) = value;
4641 _p += SIZEOF_LONG;
4642 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004643 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004644 p = _p;
4645 while (p < end) {
4646 if ((unsigned char)*p & 0x80)
4647 break;
4648 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004649 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004650 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004652#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004653#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004654 while (p < end) {
4655 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4656 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004657 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004658 /* Help allocation */
4659 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004660 while (_p < aligned_end) {
4661 unsigned long value = *(unsigned long *) _p;
4662 if (value & ASCII_CHAR_MASK)
4663 break;
4664 _p += SIZEOF_LONG;
4665 }
4666 p = _p;
4667 if (_p == end)
4668 break;
4669 }
4670 if ((unsigned char)*p & 0x80)
4671 break;
4672 ++p;
4673 }
4674 memcpy(dest, start, p - start);
4675 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004676}
Antoine Pitrouab868312009-01-10 15:40:25 +00004677
Victor Stinner785938e2011-12-11 20:09:03 +01004678PyObject *
4679PyUnicode_DecodeUTF8Stateful(const char *s,
4680 Py_ssize_t size,
4681 const char *errors,
4682 Py_ssize_t *consumed)
4683{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004684 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004685 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004686 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004687
4688 Py_ssize_t startinpos;
4689 Py_ssize_t endinpos;
4690 const char *errmsg = "";
4691 PyObject *errorHandler = NULL;
4692 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004693
4694 if (size == 0) {
4695 if (consumed)
4696 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004697 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004698 }
4699
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004700 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4701 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004702 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004703 *consumed = 1;
4704 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004705 }
4706
Victor Stinner8f674cc2013-04-17 23:02:17 +02004707 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004708 writer.min_length = size;
4709 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004710 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004711
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004712 writer.pos = ascii_decode(s, end, writer.data);
4713 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004714 while (s < end) {
4715 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004716 int kind = writer.kind;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004717 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004718 if (PyUnicode_IS_ASCII(writer.buffer))
4719 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004720 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004721 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004722 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004723 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004724 } else {
4725 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004726 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004727 }
4728
4729 switch (ch) {
4730 case 0:
4731 if (s == end || consumed)
4732 goto End;
4733 errmsg = "unexpected end of data";
4734 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004735 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004736 break;
4737 case 1:
4738 errmsg = "invalid start byte";
4739 startinpos = s - starts;
4740 endinpos = startinpos + 1;
4741 break;
4742 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004743 case 3:
4744 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004745 errmsg = "invalid continuation byte";
4746 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004747 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004748 break;
4749 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004750 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004751 goto onError;
4752 continue;
4753 }
4754
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004755 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004756 errors, &errorHandler,
4757 "utf-8", errmsg,
4758 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004759 &writer))
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004760 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004761 }
4762
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004763End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004764 if (consumed)
4765 *consumed = s - starts;
4766
4767 Py_XDECREF(errorHandler);
4768 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004769 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004770
4771onError:
4772 Py_XDECREF(errorHandler);
4773 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004774 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004775 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004776}
4777
#ifdef __APPLE__

/* Simplified UTF-8 decoder using surrogateescape error handler,
   used to decode the command line arguments on Mac OS X.

   Every byte that cannot be decoded is stored as the code unit
   0xDC00 + byte.

   Return a pointer to a newly allocated, NUL-terminated wide character
   string (use PyMem_RawFree() to free the memory), or NULL if the buffer
   size would overflow or the allocation fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *e;
    wchar_t *unicode;
    Py_ssize_t outpos;

    /* Note: size is always at least as large as the resulting number of
       wide characters, so size + 1 elements are enough.  The check is
       written without computing size + 1, which would itself overflow
       for size == PY_SSIZE_T_MAX. */
    if (size > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1)
        return NULL;
    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
    if (!unicode)
        return NULL;

    /* Unpack UTF-8 encoded data */
    e = s + size;
    outpos = 0;
    while (s < e) {
        Py_UCS4 ch;
#if SIZEOF_WCHAR_T == 4
        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
#else
        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
#endif
        if (ch > 0xFF) {
#if SIZEOF_WCHAR_T == 4
            assert(0);
#else
            /* A value this large is a non-BMP code point that has to be
               split into a surrogate pair; it is not a surrogate. */
            assert(ch > 0xFFFF && ch <= MAX_UNICODE);
            /* compute and append the two surrogates: */
            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
#endif
        }
        else {
            if (!ch && s == e)
                break;
            /* surrogateescape */
            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
        }
    }
    unicode[outpos] = L'\0';
    return unicode;
}

#endif /* __APPLE__ */
Antoine Pitrouab868312009-01-10 15:40:25 +00004833
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004834/* Primary internal function which creates utf8 encoded bytes objects.
4835
4836 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00004837 and allocate exactly as much space needed at the end. Else allocate the
4838 maximum possible needed (4 result bytes per Unicode character), and return
4839 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00004840*/
Tim Peters7e3d9612002-04-21 03:26:37 +00004841PyObject *
Victor Stinner7931d9a2011-11-04 00:22:48 +01004842_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843{
Victor Stinner6099a032011-12-18 14:22:26 +01004844 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004845 void *data;
4846 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00004847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004848 if (!PyUnicode_Check(unicode)) {
4849 PyErr_BadArgument();
4850 return NULL;
4851 }
4852
4853 if (PyUnicode_READY(unicode) == -1)
4854 return NULL;
4855
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004856 if (PyUnicode_UTF8(unicode))
4857 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4858 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004859
4860 kind = PyUnicode_KIND(unicode);
4861 data = PyUnicode_DATA(unicode);
4862 size = PyUnicode_GET_LENGTH(unicode);
4863
Benjamin Petersonead6b532011-12-20 17:23:42 -06004864 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01004865 default:
4866 assert(0);
4867 case PyUnicode_1BYTE_KIND:
4868 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4869 assert(!PyUnicode_IS_ASCII(unicode));
4870 return ucs1lib_utf8_encoder(unicode, data, size, errors);
4871 case PyUnicode_2BYTE_KIND:
4872 return ucs2lib_utf8_encoder(unicode, data, size, errors);
4873 case PyUnicode_4BYTE_KIND:
4874 return ucs4lib_utf8_encoder(unicode, data, size, errors);
Tim Peters602f7402002-04-27 18:03:26 +00004875 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876}
4877
Alexander Belopolsky40018472011-02-26 01:02:56 +00004878PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004879PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4880 Py_ssize_t size,
4881 const char *errors)
4882{
4883 PyObject *v, *unicode;
4884
4885 unicode = PyUnicode_FromUnicode(s, size);
4886 if (unicode == NULL)
4887 return NULL;
4888 v = _PyUnicode_AsUTF8String(unicode, errors);
4889 Py_DECREF(unicode);
4890 return v;
4891}
4892
4893PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00004894PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004896 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897}
4898
Walter Dörwald41980ca2007-08-16 21:55:45 +00004899/* --- UTF-32 Codec ------------------------------------------------------- */
4900
4901PyObject *
4902PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004903 Py_ssize_t size,
4904 const char *errors,
4905 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004906{
4907 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4908}
4909
4910PyObject *
4911PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00004912 Py_ssize_t size,
4913 const char *errors,
4914 int *byteorder,
4915 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00004916{
4917 const char *starts = s;
4918 Py_ssize_t startinpos;
4919 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004920 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00004921 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01004922 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004923 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004924 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00004925 PyObject *errorHandler = NULL;
4926 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00004927
Walter Dörwald41980ca2007-08-16 21:55:45 +00004928 q = (unsigned char *)s;
4929 e = q + size;
4930
4931 if (byteorder)
4932 bo = *byteorder;
4933
4934 /* Check for BOM marks (U+FEFF) in the input and adjust current
4935 byte order setting accordingly. In native mode, the leading BOM
4936 mark is skipped, in all other modes, it is copied to the output
4937 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01004938 if (bo == 0 && size >= 4) {
4939 Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4940 if (bom == 0x0000FEFF) {
4941 bo = -1;
4942 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00004943 }
Victor Stinnere64322e2012-10-30 23:12:47 +01004944 else if (bom == 0xFFFE0000) {
4945 bo = 1;
4946 q += 4;
4947 }
4948 if (byteorder)
4949 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00004950 }
4951
Victor Stinnere64322e2012-10-30 23:12:47 +01004952 if (q == e) {
4953 if (consumed)
4954 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004955 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00004956 }
4957
Victor Stinnere64322e2012-10-30 23:12:47 +01004958#ifdef WORDS_BIGENDIAN
4959 le = bo < 0;
4960#else
4961 le = bo <= 0;
4962#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004963 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01004964
Victor Stinner8f674cc2013-04-17 23:02:17 +02004965 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004966 writer.min_length = (e - q + 3) / 4;
4967 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004968 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01004969
Victor Stinnere64322e2012-10-30 23:12:47 +01004970 while (1) {
4971 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004972 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00004973
Victor Stinnere64322e2012-10-30 23:12:47 +01004974 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004975 enum PyUnicode_Kind kind = writer.kind;
4976 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01004977 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004978 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01004979 if (le) {
4980 do {
4981 ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4982 if (ch > maxch)
4983 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004984 if (kind != PyUnicode_1BYTE_KIND &&
4985 Py_UNICODE_IS_SURROGATE(ch))
4986 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004987 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01004988 q += 4;
4989 } while (q <= last);
4990 }
4991 else {
4992 do {
4993 ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
4994 if (ch > maxch)
4995 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02004996 if (kind != PyUnicode_1BYTE_KIND &&
4997 Py_UNICODE_IS_SURROGATE(ch))
4998 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004999 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005000 q += 4;
5001 } while (q <= last);
5002 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005003 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005004 }
5005
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005006 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005007 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005008 startinpos = ((const char *)q) - starts;
5009 endinpos = startinpos + 4;
5010 }
5011 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005012 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005013 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005014 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005015 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005016 startinpos = ((const char *)q) - starts;
5017 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005018 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005019 else {
5020 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005021 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005022 goto onError;
5023 q += 4;
5024 continue;
5025 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005026 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005027 startinpos = ((const char *)q) - starts;
5028 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005029 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005030
5031 /* The remaining input chars are ignored if the callback
5032 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005033 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005034 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005035 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005036 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005037 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005038 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005039 }
5040
Walter Dörwald41980ca2007-08-16 21:55:45 +00005041 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005042 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005043
Walter Dörwald41980ca2007-08-16 21:55:45 +00005044 Py_XDECREF(errorHandler);
5045 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005046 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005047
Benjamin Peterson29060642009-01-31 22:14:21 +00005048 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005049 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005050 Py_XDECREF(errorHandler);
5051 Py_XDECREF(exc);
5052 return NULL;
5053}
5054
5055PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005056_PyUnicode_EncodeUTF32(PyObject *str,
5057 const char *errors,
5058 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005059{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005060 enum PyUnicode_Kind kind;
5061 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005062 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005063 PyObject *v;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005064 PY_UINT32_T *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005065#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005066 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005067#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005068 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005069#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005070 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005071 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005072 PyObject *errorHandler = NULL;
5073 PyObject *exc = NULL;
5074 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005075
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005076 if (!PyUnicode_Check(str)) {
5077 PyErr_BadArgument();
5078 return NULL;
5079 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005080 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005081 return NULL;
5082 kind = PyUnicode_KIND(str);
5083 data = PyUnicode_DATA(str);
5084 len = PyUnicode_GET_LENGTH(str);
5085
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005086 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005087 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005088 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005089 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005090 if (v == NULL)
5091 return NULL;
5092
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005093 /* output buffer is 4-bytes aligned */
5094 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5095 out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005096 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005097 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005098 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005099 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005100
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005101 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005102 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005103 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005104 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005105 else
5106 encoding = "utf-32";
5107
5108 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005109 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5110 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005111 }
5112
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005113 pos = 0;
5114 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005115 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005116
5117 if (kind == PyUnicode_2BYTE_KIND) {
5118 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5119 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005120 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005121 else {
5122 assert(kind == PyUnicode_4BYTE_KIND);
5123 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5124 &out, native_ordering);
5125 }
5126 if (pos == len)
5127 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005128
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005129 rep = unicode_encode_call_errorhandler(
5130 errors, &errorHandler,
5131 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005132 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005133 if (!rep)
5134 goto error;
5135
5136 if (PyBytes_Check(rep)) {
5137 repsize = PyBytes_GET_SIZE(rep);
5138 if (repsize & 3) {
5139 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005140 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005141 "surrogates not allowed");
5142 goto error;
5143 }
5144 moreunits = repsize / 4;
5145 }
5146 else {
5147 assert(PyUnicode_Check(rep));
5148 if (PyUnicode_READY(rep) < 0)
5149 goto error;
5150 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5151 if (!PyUnicode_IS_ASCII(rep)) {
5152 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005153 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005154 "surrogates not allowed");
5155 goto error;
5156 }
5157 }
5158
5159 /* four bytes are reserved for each surrogate */
5160 if (moreunits > 1) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005161 Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005162 Py_ssize_t morebytes = 4 * (moreunits - 1);
5163 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5164 /* integer overflow */
5165 PyErr_NoMemory();
5166 goto error;
5167 }
5168 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5169 goto error;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005170 out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005171 }
5172
5173 if (PyBytes_Check(rep)) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005174 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5175 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005176 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005177 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005178 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5179 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005180 }
5181
5182 Py_CLEAR(rep);
5183 }
5184
5185 /* Cut back to size actually needed. This is necessary for, for example,
5186 encoding of a string containing isolated surrogates and the 'ignore'
5187 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005188 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005189 if (nsize != PyBytes_GET_SIZE(v))
5190 _PyBytes_Resize(&v, nsize);
5191 Py_XDECREF(errorHandler);
5192 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005193 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005194 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005195 error:
5196 Py_XDECREF(rep);
5197 Py_XDECREF(errorHandler);
5198 Py_XDECREF(exc);
5199 Py_XDECREF(v);
5200 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005201}
5202
Alexander Belopolsky40018472011-02-26 01:02:56 +00005203PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005204PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5205 Py_ssize_t size,
5206 const char *errors,
5207 int byteorder)
5208{
5209 PyObject *result;
5210 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5211 if (tmp == NULL)
5212 return NULL;
5213 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5214 Py_DECREF(tmp);
5215 return result;
5216}
5217
5218PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005219PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005220{
Victor Stinnerb960b342011-11-20 19:12:52 +01005221 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005222}
5223
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224/* --- UTF-16 Codec ------------------------------------------------------- */
5225
Tim Peters772747b2001-08-09 22:21:55 +00005226PyObject *
5227PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005228 Py_ssize_t size,
5229 const char *errors,
5230 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231{
Walter Dörwald69652032004-09-07 20:24:22 +00005232 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5233}
5234
5235PyObject *
5236PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005237 Py_ssize_t size,
5238 const char *errors,
5239 int *byteorder,
5240 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005241{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005242 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005243 Py_ssize_t startinpos;
5244 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005245 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005246 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005247 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005248 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005249 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005250 PyObject *errorHandler = NULL;
5251 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005252 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253
Tim Peters772747b2001-08-09 22:21:55 +00005254 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005255 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256
5257 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005258 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005260 /* Check for BOM marks (U+FEFF) in the input and adjust current
5261 byte order setting accordingly. In native mode, the leading BOM
5262 mark is skipped, in all other modes, it is copied to the output
5263 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005264 if (bo == 0 && size >= 2) {
5265 const Py_UCS4 bom = (q[1] << 8) | q[0];
5266 if (bom == 0xFEFF) {
5267 q += 2;
5268 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005269 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005270 else if (bom == 0xFFFE) {
5271 q += 2;
5272 bo = 1;
5273 }
5274 if (byteorder)
5275 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005276 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277
Antoine Pitrou63065d72012-05-15 23:48:04 +02005278 if (q == e) {
5279 if (consumed)
5280 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005281 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005282 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005283
Christian Heimes743e0cd2012-10-17 23:52:17 +02005284#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005285 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005286 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005287#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005288 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005289 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005290#endif
Tim Peters772747b2001-08-09 22:21:55 +00005291
Antoine Pitrou63065d72012-05-15 23:48:04 +02005292 /* Note: size will always be longer than the resulting Unicode
5293 character count */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005294 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005295 writer.min_length = (e - q + 1) / 2;
5296 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005297 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005298
Antoine Pitrou63065d72012-05-15 23:48:04 +02005299 while (1) {
5300 Py_UCS4 ch = 0;
5301 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005302 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005303 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005304 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005305 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005306 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005307 native_ordering);
5308 else
5309 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005310 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005311 native_ordering);
5312 } else if (kind == PyUnicode_2BYTE_KIND) {
5313 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005314 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005315 native_ordering);
5316 } else {
5317 assert(kind == PyUnicode_4BYTE_KIND);
5318 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005319 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005320 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005321 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005322 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005323
Antoine Pitrou63065d72012-05-15 23:48:04 +02005324 switch (ch)
5325 {
5326 case 0:
5327 /* remaining byte at the end? (size should be even) */
5328 if (q == e || consumed)
5329 goto End;
5330 errmsg = "truncated data";
5331 startinpos = ((const char *)q) - starts;
5332 endinpos = ((const char *)e) - starts;
5333 break;
5334 /* The remaining input chars are ignored if the callback
5335 chooses to skip the input */
5336 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005337 q -= 2;
5338 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005339 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005340 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005341 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005342 endinpos = ((const char *)e) - starts;
5343 break;
5344 case 2:
5345 errmsg = "illegal encoding";
5346 startinpos = ((const char *)q) - 2 - starts;
5347 endinpos = startinpos + 2;
5348 break;
5349 case 3:
5350 errmsg = "illegal UTF-16 surrogate";
5351 startinpos = ((const char *)q) - 4 - starts;
5352 endinpos = startinpos + 2;
5353 break;
5354 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005355 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005356 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005357 continue;
5358 }
5359
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005360 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005361 errors,
5362 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005363 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005364 &starts,
5365 (const char **)&e,
5366 &startinpos,
5367 &endinpos,
5368 &exc,
5369 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005370 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005371 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372 }
5373
Antoine Pitrou63065d72012-05-15 23:48:04 +02005374End:
Walter Dörwald69652032004-09-07 20:24:22 +00005375 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005376 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005378 Py_XDECREF(errorHandler);
5379 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005380 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381
Benjamin Peterson29060642009-01-31 22:14:21 +00005382 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005383 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005384 Py_XDECREF(errorHandler);
5385 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386 return NULL;
5387}
5388
Tim Peters772747b2001-08-09 22:21:55 +00005389PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005390_PyUnicode_EncodeUTF16(PyObject *str,
5391 const char *errors,
5392 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005394 enum PyUnicode_Kind kind;
5395 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005396 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005397 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005398 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005399 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005400#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005401 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005402#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005403 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005404#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005405 const char *encoding;
5406 Py_ssize_t nsize, pos;
5407 PyObject *errorHandler = NULL;
5408 PyObject *exc = NULL;
5409 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005410
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005411 if (!PyUnicode_Check(str)) {
5412 PyErr_BadArgument();
5413 return NULL;
5414 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005415 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005416 return NULL;
5417 kind = PyUnicode_KIND(str);
5418 data = PyUnicode_DATA(str);
5419 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005420
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005421 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005422 if (kind == PyUnicode_4BYTE_KIND) {
5423 const Py_UCS4 *in = (const Py_UCS4 *)data;
5424 const Py_UCS4 *end = in + len;
5425 while (in < end)
5426 if (*in++ >= 0x10000)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005427 pairs++;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005428 }
5429 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
Benjamin Peterson29060642009-01-31 22:14:21 +00005430 return PyErr_NoMemory();
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005431 nsize = len + pairs + (byteorder == 0);
5432 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 if (v == NULL)
5434 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005436 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005437 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005438 out = (unsigned short *)PyBytes_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 if (byteorder == 0)
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005440 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005441 if (len == 0)
Guido van Rossum98297ee2007-11-06 21:34:58 +00005442 goto done;
Tim Peters772747b2001-08-09 22:21:55 +00005443
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005444 if (kind == PyUnicode_1BYTE_KIND) {
5445 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5446 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005447 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005448
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005449 if (byteorder < 0)
5450 encoding = "utf-16-le";
5451 else if (byteorder > 0)
5452 encoding = "utf-16-be";
5453 else
5454 encoding = "utf-16";
5455
5456 pos = 0;
5457 while (pos < len) {
5458 Py_ssize_t repsize, moreunits;
5459
5460 if (kind == PyUnicode_2BYTE_KIND) {
5461 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5462 &out, native_ordering);
5463 }
5464 else {
5465 assert(kind == PyUnicode_4BYTE_KIND);
5466 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5467 &out, native_ordering);
5468 }
5469 if (pos == len)
5470 break;
5471
5472 rep = unicode_encode_call_errorhandler(
5473 errors, &errorHandler,
5474 encoding, "surrogates not allowed",
5475 str, &exc, pos, pos + 1, &pos);
5476 if (!rep)
5477 goto error;
5478
5479 if (PyBytes_Check(rep)) {
5480 repsize = PyBytes_GET_SIZE(rep);
5481 if (repsize & 1) {
5482 raise_encode_exception(&exc, encoding,
5483 str, pos - 1, pos,
5484 "surrogates not allowed");
5485 goto error;
5486 }
5487 moreunits = repsize / 2;
5488 }
5489 else {
5490 assert(PyUnicode_Check(rep));
5491 if (PyUnicode_READY(rep) < 0)
5492 goto error;
5493 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5494 if (!PyUnicode_IS_ASCII(rep)) {
5495 raise_encode_exception(&exc, encoding,
5496 str, pos - 1, pos,
5497 "surrogates not allowed");
5498 goto error;
5499 }
5500 }
5501
5502 /* two bytes are reserved for each surrogate */
5503 if (moreunits > 1) {
5504 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5505 Py_ssize_t morebytes = 2 * (moreunits - 1);
5506 if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5507 /* integer overflow */
5508 PyErr_NoMemory();
5509 goto error;
5510 }
5511 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5512 goto error;
5513 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5514 }
5515
5516 if (PyBytes_Check(rep)) {
5517 Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5518 out += moreunits;
5519 } else /* rep is unicode */ {
5520 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5521 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5522 &out, native_ordering);
5523 }
5524
5525 Py_CLEAR(rep);
5526 }
5527
5528 /* Cut back to size actually needed. This is necessary for, for example,
5529 encoding of a string containing isolated surrogates and the 'ignore' handler
5530 is used. */
5531 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5532 if (nsize != PyBytes_GET_SIZE(v))
5533 _PyBytes_Resize(&v, nsize);
5534 Py_XDECREF(errorHandler);
5535 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00005536 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005537 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005538 error:
5539 Py_XDECREF(rep);
5540 Py_XDECREF(errorHandler);
5541 Py_XDECREF(exc);
5542 Py_XDECREF(v);
5543 return NULL;
5544#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545}
5546
Alexander Belopolsky40018472011-02-26 01:02:56 +00005547PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005548PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5549 Py_ssize_t size,
5550 const char *errors,
5551 int byteorder)
5552{
5553 PyObject *result;
5554 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5555 if (tmp == NULL)
5556 return NULL;
5557 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5558 Py_DECREF(tmp);
5559 return result;
5560}
5561
5562PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005563PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005565 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566}
5567
5568/* --- Unicode Escape Codec ----------------------------------------------- */
5569
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005570/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5571 if all the escapes in the string make it still a valid ASCII string.
5572 Returns -1 if any escapes were found which cause the string to
5573 pop out of ASCII range. Otherwise returns the length of the
5574 required buffer to hold the string.
5575 */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02005576static Py_ssize_t
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005577length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5578{
5579 const unsigned char *p = (const unsigned char *)s;
5580 const unsigned char *end = p + size;
5581 Py_ssize_t length = 0;
5582
5583 if (size < 0)
5584 return -1;
5585
5586 for (; p < end; ++p) {
5587 if (*p > 127) {
5588 /* Non-ASCII */
5589 return -1;
5590 }
5591 else if (*p != '\\') {
5592 /* Normal character */
5593 ++length;
5594 }
5595 else {
5596 /* Backslash-escape, check next char */
5597 ++p;
5598 /* Escape sequence reaches till end of string or
5599 non-ASCII follow-up. */
5600 if (p >= end || *p > 127)
5601 return -1;
5602 switch (*p) {
5603 case '\n':
5604 /* backslash + \n result in zero characters */
5605 break;
5606 case '\\': case '\'': case '\"':
5607 case 'b': case 'f': case 't':
5608 case 'n': case 'r': case 'v': case 'a':
5609 ++length;
5610 break;
5611 case '0': case '1': case '2': case '3':
5612 case '4': case '5': case '6': case '7':
5613 case 'x': case 'u': case 'U': case 'N':
5614 /* these do not guarantee ASCII characters */
5615 return -1;
5616 default:
5617 /* count the backslash + the other character */
5618 length += 2;
5619 }
5620 }
5621 }
5622 return length;
5623}
5624
Fredrik Lundh06d12682001-01-24 07:59:11 +00005625static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00005626
Alexander Belopolsky40018472011-02-26 01:02:56 +00005627PyObject *
5628PyUnicode_DecodeUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005629 Py_ssize_t size,
Victor Stinnerc17f5402011-09-29 00:16:58 +02005630 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005632 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005633 Py_ssize_t startinpos;
5634 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005635 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005637 char* message;
5638 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005639 PyObject *errorHandler = NULL;
5640 PyObject *exc = NULL;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005641 Py_ssize_t len;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005642
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005643 len = length_of_escaped_ascii_string(s, size);
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005644 if (len == 0)
5645 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005646
5647 /* After length_of_escaped_ascii_string() there are two alternatives,
5648 either the string is pure ASCII with named escapes like \n, etc.
5649 and we determined it's exact size (common case)
5650 or it contains \x, \u, ... escape sequences. then we create a
5651 legacy wchar string and resize it at the end of this function. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005652 _PyUnicodeWriter_Init(&writer);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005653 if (len > 0) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02005654 writer.min_length = len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005655 }
5656 else {
5657 /* Escaped strings will always be longer than the resulting
5658 Unicode string, so we start with size here and then reduce the
5659 length after conversion to the true value.
5660 (but if the error callback returns a long replacement string
5661 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005662 writer.min_length = size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005663 }
5664
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 if (size == 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005666 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005667 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005668
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 while (s < end) {
5670 unsigned char c;
Victor Stinner24729f32011-11-10 20:31:37 +01005671 Py_UCS4 x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005672 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673
5674 /* Non-escape characters are interpreted as Unicode ordinals */
5675 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005676 x = (unsigned char)*s;
5677 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005678 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005679 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 continue;
5681 }
5682
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005683 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 /* \ - Escapes */
5685 s++;
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005686 c = *s++;
5687 if (s > end)
5688 c = '\0'; /* Invalid after \ */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005689
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005690 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691
Benjamin Peterson29060642009-01-31 22:14:21 +00005692 /* \x escapes */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005693#define WRITECHAR(ch) \
5694 do { \
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005695 if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0) \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005696 goto onError; \
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005697 } while(0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005698
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 case '\n': break;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005700 case '\\': WRITECHAR('\\'); break;
5701 case '\'': WRITECHAR('\''); break;
5702 case '\"': WRITECHAR('\"'); break;
5703 case 'b': WRITECHAR('\b'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005704 /* FF */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005705 case 'f': WRITECHAR('\014'); break;
5706 case 't': WRITECHAR('\t'); break;
5707 case 'n': WRITECHAR('\n'); break;
5708 case 'r': WRITECHAR('\r'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709 /* VT */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005710 case 'v': WRITECHAR('\013'); break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005711 /* BEL, not classic C */
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005712 case 'a': WRITECHAR('\007'); break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713
Benjamin Peterson29060642009-01-31 22:14:21 +00005714 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715 case '0': case '1': case '2': case '3':
5716 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005717 x = s[-1] - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005718 if (s < end && '0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005719 x = (x<<3) + *s++ - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00005720 if (s < end && '0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00005721 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005723 WRITECHAR(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 break;
5725
Benjamin Peterson29060642009-01-31 22:14:21 +00005726 /* hex escapes */
5727 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005728 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005729 digits = 2;
5730 message = "truncated \\xXX escape";
5731 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005732
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005735 digits = 4;
5736 message = "truncated \\uXXXX escape";
5737 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00005740 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00005741 digits = 8;
5742 message = "truncated \\UXXXXXXXX escape";
5743 hexescape:
5744 chr = 0;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005745 if (end - s < digits) {
5746 /* count only hex digits */
5747 for (; s < end; ++s) {
5748 c = (unsigned char)*s;
5749 if (!Py_ISXDIGIT(c))
5750 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005751 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005752 goto error;
5753 }
5754 for (; digits--; ++s) {
5755 c = (unsigned char)*s;
5756 if (!Py_ISXDIGIT(c))
5757 goto error;
Fredrik Lundhdf846752000-09-03 11:29:49 +00005758 chr = (chr<<4) & ~0xF;
5759 if (c >= '0' && c <= '9')
5760 chr += c - '0';
5761 else if (c >= 'a' && c <= 'f')
5762 chr += 10 + c - 'a';
5763 else
5764 chr += 10 + c - 'A';
5765 }
Jeremy Hylton504de6b2003-10-06 05:08:26 +00005766 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005767 /* _decoding_error will have already written into the
5768 target buffer. */
5769 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005770 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00005771 /* when we get here, chr is a 32-bit unicode character */
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005772 message = "illegal Unicode character";
5773 if (chr > MAX_UNICODE)
Serhiy Storchakad6793772013-01-29 10:20:44 +02005774 goto error;
Serhiy Storchaka24193de2013-01-29 10:28:07 +02005775 WRITECHAR(chr);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005776 break;
5777
Benjamin Peterson29060642009-01-31 22:14:21 +00005778 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00005779 case 'N':
5780 message = "malformed \\N character escape";
5781 if (ucnhash_CAPI == NULL) {
5782 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005783 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5784 PyUnicodeData_CAPSULE_NAME, 1);
Fredrik Lundhccc74732001-02-18 22:13:49 +00005785 if (ucnhash_CAPI == NULL)
5786 goto ucnhashError;
5787 }
5788 if (*s == '{') {
5789 const char *start = s+1;
5790 /* look for the closing brace */
5791 while (*s != '}' && s < end)
5792 s++;
5793 if (s > start && s < end && *s == '}') {
5794 /* found a name. look it up in the unicode database */
5795 message = "unknown Unicode character name";
5796 s++;
Serhiy Storchaka4f5f0e52013-01-21 11:38:00 +02005797 if (s - start - 1 <= INT_MAX &&
Serhiy Storchakac35f3a92013-01-21 11:42:57 +02005798 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
Ezio Melotti931b8aa2011-10-21 21:57:36 +03005799 &chr, 0))
Fredrik Lundhccc74732001-02-18 22:13:49 +00005800 goto store;
5801 }
5802 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005803 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00005804
5805 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00005806 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005807 message = "\\ at end of string";
5808 s--;
Serhiy Storchakad6793772013-01-29 10:20:44 +02005809 goto error;
Walter Dörwald8c077222002-03-25 11:16:18 +00005810 }
5811 else {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005812 WRITECHAR('\\');
Serhiy Storchaka73e38802013-01-25 23:52:21 +02005813 WRITECHAR((unsigned char)s[-1]);
Walter Dörwald8c077222002-03-25 11:16:18 +00005814 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00005815 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02005817 continue;
5818
5819 error:
5820 endinpos = s-starts;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005821 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02005822 errors, &errorHandler,
5823 "unicodeescape", message,
5824 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02005825 &writer))
Serhiy Storchakad6793772013-01-29 10:20:44 +02005826 goto onError;
5827 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828 }
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005829#undef WRITECHAR
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005830
Walter Dörwaldd4ade082003-08-15 15:00:26 +00005831 Py_XDECREF(errorHandler);
5832 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005833 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00005834
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00005836 PyErr_SetString(
5837 PyExc_UnicodeError,
5838 "\\N escapes not supported (can't load unicodedata module)"
5839 );
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005840 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005841 Py_XDECREF(errorHandler);
5842 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00005843 return NULL;
5844
Benjamin Peterson29060642009-01-31 22:14:21 +00005845 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005846 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005847 Py_XDECREF(errorHandler);
5848 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005849 return NULL;
5850}
5851
5852/* Return a Unicode-Escape string version of the Unicode object.
5853
5854 If quotes is true, the string is enclosed in u"" or u'' quotes as
5855 appropriate.
5856
5857*/
5858
Alexander Belopolsky40018472011-02-26 01:02:56 +00005859PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005860PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005862 Py_ssize_t i, len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005863 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864 char *p;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005865 int kind;
5866 void *data;
5867 Py_ssize_t expandsize = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868
Ezio Melottie7f90372012-10-05 03:33:31 +03005869 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00005870 escape.
5871
Ezio Melottie7f90372012-10-05 03:33:31 +03005872 For UCS1 strings it's '\xxx', 4 bytes per source character.
5873 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5874 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00005875 */
5876
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005877 if (!PyUnicode_Check(unicode)) {
5878 PyErr_BadArgument();
5879 return NULL;
5880 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005881 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005882 return NULL;
5883 len = PyUnicode_GET_LENGTH(unicode);
5884 kind = PyUnicode_KIND(unicode);
5885 data = PyUnicode_DATA(unicode);
Benjamin Petersonead6b532011-12-20 17:23:42 -06005886 switch (kind) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005887 case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5888 case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5889 case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5890 }
5891
5892 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005893 return PyBytes_FromStringAndSize(NULL, 0);
5894
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005895 if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00005896 return PyErr_NoMemory();
Neal Norwitz3ce5d922008-08-24 07:08:55 +00005897
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005898 repr = PyBytes_FromStringAndSize(NULL,
Benjamin Peterson29060642009-01-31 22:14:21 +00005899 2
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005900 + expandsize*len
Benjamin Peterson29060642009-01-31 22:14:21 +00005901 + 1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005902 if (repr == NULL)
5903 return NULL;
5904
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005905 p = PyBytes_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005906
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005907 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01005908 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005909
Walter Dörwald79e913e2007-05-12 11:08:06 +00005910 /* Escape backslashes */
5911 if (ch == '\\') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 *p++ = '\\';
5913 *p++ = (char) ch;
Walter Dörwald79e913e2007-05-12 11:08:06 +00005914 continue;
Tim Petersced69f82003-09-16 20:30:58 +00005915 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005916
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005917 /* Map 21-bit characters to '\U00xxxxxx' */
5918 else if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005919 assert(ch <= MAX_UNICODE);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005920 *p++ = '\\';
5921 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005922 *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5923 *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5924 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5925 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5926 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5927 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5928 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5929 *p++ = Py_hexdigits[ch & 0x0000000F];
Benjamin Peterson29060642009-01-31 22:14:21 +00005930 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005931 }
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005932
Guido van Rossumd57fd912000-03-10 22:53:23 +00005933 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00005934 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935 *p++ = '\\';
5936 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005937 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5938 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5939 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5940 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005942
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005943 /* Map special whitespace to '\t', \n', '\r' */
5944 else if (ch == '\t') {
5945 *p++ = '\\';
5946 *p++ = 't';
5947 }
5948 else if (ch == '\n') {
5949 *p++ = '\\';
5950 *p++ = 'n';
5951 }
5952 else if (ch == '\r') {
5953 *p++ = '\\';
5954 *p++ = 'r';
5955 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005956
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005957 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00005958 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00005960 *p++ = 'x';
Victor Stinnerf5cff562011-10-14 02:13:11 +02005961 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5962 *p++ = Py_hexdigits[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00005963 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00005964
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 /* Copy everything else as-is */
5966 else
5967 *p++ = (char) ch;
5968 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005969
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005970 assert(p - PyBytes_AS_STRING(repr) > 0);
5971 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5972 return NULL;
5973 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974}
5975
Alexander Belopolsky40018472011-02-26 01:02:56 +00005976PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005977PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5978 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005980 PyObject *result;
5981 PyObject *tmp = PyUnicode_FromUnicode(s, size);
5982 if (tmp == NULL)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005984 result = PyUnicode_AsUnicodeEscapeString(tmp);
5985 Py_DECREF(tmp);
5986 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005987}
5988
5989/* --- Raw Unicode Escape Codec ------------------------------------------- */
5990
Alexander Belopolsky40018472011-02-26 01:02:56 +00005991PyObject *
5992PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005993 Py_ssize_t size,
5994 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005996 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005997 Py_ssize_t startinpos;
5998 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005999 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000 const char *end;
6001 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006002 PyObject *errorHandler = NULL;
6003 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006004
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006005 if (size == 0)
6006 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006007
Guido van Rossumd57fd912000-03-10 22:53:23 +00006008 /* Escaped strings will always be longer than the resulting
6009 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006010 length after conversion to the true value. (But decoding error
6011 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006012 _PyUnicodeWriter_Init(&writer);
6013 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006014
Guido van Rossumd57fd912000-03-10 22:53:23 +00006015 end = s + size;
6016 while (s < end) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 unsigned char c;
6018 Py_UCS4 x;
6019 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006020 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021
Benjamin Peterson29060642009-01-31 22:14:21 +00006022 /* Non-escape characters are interpreted as Unicode ordinals */
6023 if (*s != '\\') {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006024 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006025 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006026 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006028 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006029 startinpos = s-starts;
6030
6031 /* \u-escapes are only interpreted iff the number of leading
6032 backslashes if odd */
6033 bs = s;
6034 for (;s < end;) {
6035 if (*s != '\\')
6036 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006037 x = (unsigned char)*s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006038 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006039 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006040 }
6041 if (((s - bs) & 1) == 0 ||
6042 s >= end ||
6043 (*s != 'u' && *s != 'U')) {
6044 continue;
6045 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006046 writer.pos--;
Benjamin Peterson29060642009-01-31 22:14:21 +00006047 count = *s=='u' ? 4 : 8;
6048 s++;
6049
6050 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Benjamin Peterson29060642009-01-31 22:14:21 +00006051 for (x = 0, i = 0; i < count; ++i, ++s) {
6052 c = (unsigned char)*s;
David Malcolm96960882010-11-05 17:23:41 +00006053 if (!Py_ISXDIGIT(c)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006054 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006055 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006056 errors, &errorHandler,
6057 "rawunicodeescape", "truncated \\uXXXX",
6058 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006059 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006060 goto onError;
6061 goto nextByte;
6062 }
6063 x = (x<<4) & ~0xF;
6064 if (c >= '0' && c <= '9')
6065 x += c - '0';
6066 else if (c >= 'a' && c <= 'f')
6067 x += 10 + c - 'a';
6068 else
6069 x += 10 + c - 'A';
6070 }
Victor Stinner8faf8212011-12-08 22:14:11 +01006071 if (x <= MAX_UNICODE) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006072 if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006073 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006074 }
6075 else {
Christian Heimesfe337bf2008-03-23 21:54:12 +00006076 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006077 if (unicode_decode_call_errorhandler_writer(
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006078 errors, &errorHandler,
6079 "rawunicodeescape", "\\Uxxxxxxxx out of range",
Benjamin Peterson29060642009-01-31 22:14:21 +00006080 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006081 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006082 goto onError;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006083 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006084 nextByte:
6085 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006086 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006087 Py_XDECREF(errorHandler);
6088 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006089 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006090
Benjamin Peterson29060642009-01-31 22:14:21 +00006091 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006092 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006093 Py_XDECREF(errorHandler);
6094 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006095 return NULL;
6096}
6097
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006098
Alexander Belopolsky40018472011-02-26 01:02:56 +00006099PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006100PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006101{
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006102 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103 char *p;
6104 char *q;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006105 Py_ssize_t expandsize, pos;
6106 int kind;
6107 void *data;
6108 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006110 if (!PyUnicode_Check(unicode)) {
6111 PyErr_BadArgument();
6112 return NULL;
6113 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006114 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006115 return NULL;
6116 kind = PyUnicode_KIND(unicode);
6117 data = PyUnicode_DATA(unicode);
6118 len = PyUnicode_GET_LENGTH(unicode);
Benjamin Peterson1518e872011-11-23 10:44:52 -06006119 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6120 bytes, and 1 byte characters 4. */
6121 expandsize = kind * 2 + 2;
Victor Stinner0e368262011-11-10 20:12:49 +01006122
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006123 if (len > PY_SSIZE_T_MAX / expandsize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 return PyErr_NoMemory();
Benjamin Peterson14339b62009-01-31 16:36:08 +00006125
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006126 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127 if (repr == NULL)
6128 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006129 if (len == 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006130 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006131
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006132 p = q = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006133 for (pos = 0; pos < len; pos++) {
6134 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006135 /* Map 32-bit characters to '\Uxxxxxxxx' */
6136 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01006137 assert(ch <= MAX_UNICODE);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006138 *p++ = '\\';
6139 *p++ = 'U';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006140 *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6141 *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6142 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6143 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6144 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6145 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6146 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6147 *p++ = Py_hexdigits[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00006148 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006149 /* Map 16-bit characters to '\uxxxx' */
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006150 else if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 *p++ = '\\';
6152 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006153 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6154 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6155 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6156 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006158 /* Copy everything else as-is */
6159 else
Guido van Rossumd57fd912000-03-10 22:53:23 +00006160 *p++ = (char) ch;
6161 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006162
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006163 assert(p > q);
6164 if (_PyBytes_Resize(&repr, p - q) < 0)
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006165 return NULL;
6166 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167}
6168
Alexander Belopolsky40018472011-02-26 01:02:56 +00006169PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006170PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6171 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006172{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006173 PyObject *result;
6174 PyObject *tmp = PyUnicode_FromUnicode(s, size);
6175 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006176 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006177 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6178 Py_DECREF(tmp);
6179 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180}
6181
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006182/* --- Unicode Internal Codec ------------------------------------------- */
6183
Alexander Belopolsky40018472011-02-26 01:02:56 +00006184PyObject *
6185_PyUnicode_DecodeUnicodeInternal(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006186 Py_ssize_t size,
6187 const char *errors)
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006188{
6189 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006190 Py_ssize_t startinpos;
6191 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006192 _PyUnicodeWriter writer;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006193 const char *end;
6194 const char *reason;
6195 PyObject *errorHandler = NULL;
6196 PyObject *exc = NULL;
6197
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006198 if (PyErr_WarnEx(PyExc_DeprecationWarning,
Ezio Melotti11060a42011-11-16 09:39:10 +02006199 "unicode_internal codec has been deprecated",
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006200 1))
6201 return NULL;
6202
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006203 if (size == 0)
6204 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006205
Victor Stinner8f674cc2013-04-17 23:02:17 +02006206 _PyUnicodeWriter_Init(&writer);
6207 if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6208 PyErr_NoMemory();
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 goto onError;
Victor Stinner8f674cc2013-04-17 23:02:17 +02006210 }
6211 writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006212
Victor Stinner8f674cc2013-04-17 23:02:17 +02006213 end = s + size;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006214 while (s < end) {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006215 Py_UNICODE uch;
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006216 Py_UCS4 ch;
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006217 if (end - s < Py_UNICODE_SIZE) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006218 endinpos = end-starts;
6219 reason = "truncated input";
6220 goto error;
6221 }
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006222 /* We copy the raw representation one byte at a time because the
6223 pointer may be unaligned (see test_codeccallbacks). */
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006224 ((char *) &uch)[0] = s[0];
6225 ((char *) &uch)[1] = s[1];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006226#ifdef Py_UNICODE_WIDE
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006227 ((char *) &uch)[2] = s[2];
6228 ((char *) &uch)[3] = s[3];
Antoine Pitrou44c6aff2011-11-11 02:59:42 +01006229#endif
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006230 ch = uch;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006231#ifdef Py_UNICODE_WIDE
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006232 /* We have to sanity check the raw data, otherwise doom looms for
6233 some malformed UCS-4 data. */
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006234 if (ch > 0x10ffff) {
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006235 endinpos = s - starts + Py_UNICODE_SIZE;
6236 reason = "illegal code point (> 0x10FFFF)";
6237 goto error;
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006238 }
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006239#endif
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006240 s += Py_UNICODE_SIZE;
6241#ifndef Py_UNICODE_WIDE
Serhiy Storchaka03ee12e2013-02-07 16:25:25 +02006242 if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006243 {
Antoine Pitrou0290c7a2011-11-11 13:29:12 +01006244 Py_UNICODE uch2;
6245 ((char *) &uch2)[0] = s[0];
6246 ((char *) &uch2)[1] = s[1];
Victor Stinner551ac952011-11-29 22:58:13 +01006247 if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006248 {
Victor Stinner551ac952011-11-29 22:58:13 +01006249 ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006250 s += Py_UNICODE_SIZE;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006251 }
6252 }
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006253#endif
6254
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006255 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinner9f4b1e92011-11-10 20:56:30 +01006256 goto onError;
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006257 continue;
6258
6259 error:
6260 startinpos = s - starts;
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006261 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006262 errors, &errorHandler,
6263 "unicode_internal", reason,
6264 &starts, &end, &startinpos, &endinpos, &exc, &s,
Serhiy Storchakad0c79dc2013-02-07 16:26:55 +02006265 &writer))
Serhiy Storchaka3fd4ab32013-02-07 16:23:21 +02006266 goto onError;
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006267 }
6268
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006269 Py_XDECREF(errorHandler);
6270 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006271 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006272
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006274 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwalda47d1c02005-08-30 10:23:14 +00006275 Py_XDECREF(errorHandler);
6276 Py_XDECREF(exc);
6277 return NULL;
6278}
6279
/* --- Latin-1 Codec ------------------------------------------------------ */
6281
Alexander Belopolsky40018472011-02-26 01:02:56 +00006282PyObject *
6283PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006284 Py_ssize_t size,
6285 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006288 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289}
6290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006291/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006292static void
6293make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006294 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006295 PyObject *unicode,
6296 Py_ssize_t startpos, Py_ssize_t endpos,
6297 const char *reason)
6298{
6299 if (*exceptionObject == NULL) {
6300 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006301 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006302 encoding, unicode, startpos, endpos, reason);
6303 }
6304 else {
6305 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6306 goto onError;
6307 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6308 goto onError;
6309 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6310 goto onError;
6311 return;
6312 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006313 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006314 }
6315}
6316
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006317/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006318static void
6319raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006320 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006321 PyObject *unicode,
6322 Py_ssize_t startpos, Py_ssize_t endpos,
6323 const char *reason)
6324{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006325 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006326 encoding, unicode, startpos, endpos, reason);
6327 if (*exceptionObject != NULL)
6328 PyCodec_StrictErrors(*exceptionObject);
6329}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006330
6331/* error handling callback helper:
6332 build arguments, call the callback and check the arguments,
6333 put the result into newpos and return the replacement string, which
6334 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006335static PyObject *
6336unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006337 PyObject **errorHandler,
6338 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006339 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006340 Py_ssize_t startpos, Py_ssize_t endpos,
6341 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006342{
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006343 static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006344 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006345 PyObject *restuple;
6346 PyObject *resunicode;
6347
6348 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006349 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006350 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006351 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006352 }
6353
Benjamin Petersonbac79492012-01-14 13:34:47 -05006354 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006355 return NULL;
6356 len = PyUnicode_GET_LENGTH(unicode);
6357
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006358 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006359 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006361 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006362
6363 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00006364 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006365 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006366 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006367 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006368 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006369 Py_DECREF(restuple);
6370 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006371 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006372 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006373 &resunicode, newpos)) {
6374 Py_DECREF(restuple);
6375 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006376 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006377 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6378 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6379 Py_DECREF(restuple);
6380 return NULL;
6381 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006382 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006383 *newpos = len + *newpos;
6384 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006385 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006386 Py_DECREF(restuple);
6387 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006388 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006389 Py_INCREF(resunicode);
6390 Py_DECREF(restuple);
6391 return resunicode;
6392}
6393
Alexander Belopolsky40018472011-02-26 01:02:56 +00006394static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006395unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006396 const char *errors,
Victor Stinnerfcd96532011-11-04 00:28:50 +01006397 unsigned int limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006398{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006399 /* input state */
6400 Py_ssize_t pos=0, size;
6401 int kind;
6402 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006403 /* output object */
6404 PyObject *res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006405 /* pointer into the output */
6406 char *str;
6407 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00006408 Py_ssize_t ressize;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006409 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6410 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006411 PyObject *errorHandler = NULL;
6412 PyObject *exc = NULL;
6413 /* the following variable is used for caching string comparisons
6414 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6415 int known_errorHandler = -1;
6416
Benjamin Petersonbac79492012-01-14 13:34:47 -05006417 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006418 return NULL;
6419 size = PyUnicode_GET_LENGTH(unicode);
6420 kind = PyUnicode_KIND(unicode);
6421 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006422 /* allocate enough for a simple encoding without
6423 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006424 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006425 return PyBytes_FromStringAndSize(NULL, 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006426 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006427 if (res == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006428 return NULL;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006429 str = PyBytes_AS_STRING(res);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006430 ressize = size;
6431
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006432 while (pos < size) {
6433 Py_UCS4 c = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006434
Benjamin Peterson29060642009-01-31 22:14:21 +00006435 /* can we encode this? */
6436 if (c<limit) {
6437 /* no overflow check, because we know that the space is enough */
6438 *str++ = (char)c;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006439 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006440 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006441 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006442 Py_ssize_t requiredsize;
6443 PyObject *repunicode;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006444 Py_ssize_t repsize, newpos, respos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006445 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006446 Py_ssize_t collstart = pos;
6447 Py_ssize_t collend = pos;
Benjamin Peterson29060642009-01-31 22:14:21 +00006448 /* find all unecodable characters */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006449 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006450 ++collend;
6451 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6452 if (known_errorHandler==-1) {
6453 if ((errors==NULL) || (!strcmp(errors, "strict")))
6454 known_errorHandler = 1;
6455 else if (!strcmp(errors, "replace"))
6456 known_errorHandler = 2;
6457 else if (!strcmp(errors, "ignore"))
6458 known_errorHandler = 3;
6459 else if (!strcmp(errors, "xmlcharrefreplace"))
6460 known_errorHandler = 4;
6461 else
6462 known_errorHandler = 0;
6463 }
6464 switch (known_errorHandler) {
6465 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006466 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006467 goto onError;
6468 case 2: /* replace */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006469 while (collstart++ < collend)
Benjamin Peterson29060642009-01-31 22:14:21 +00006470 *str++ = '?'; /* fall through */
6471 case 3: /* ignore */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006472 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006473 break;
6474 case 4: /* xmlcharrefreplace */
6475 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006476 requiredsize = respos;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006477 /* determine replacement size */
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006478 for (i = collstart; i < collend; ++i) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006479 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006480 Py_ssize_t incr;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006481 if (ch < 10)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006482 incr = 2+1+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006483 else if (ch < 100)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006484 incr = 2+2+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006485 else if (ch < 1000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006486 incr = 2+3+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006487 else if (ch < 10000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006488 incr = 2+4+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006489 else if (ch < 100000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006490 incr = 2+5+1;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006491 else if (ch < 1000000)
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006492 incr = 2+6+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006493 else {
Victor Stinner8faf8212011-12-08 22:14:11 +01006494 assert(ch <= MAX_UNICODE);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006495 incr = 2+7+1;
Victor Stinner0d3721d2011-11-22 03:27:53 +01006496 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006497 if (requiredsize > PY_SSIZE_T_MAX - incr)
6498 goto overflow;
6499 requiredsize += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +00006500 }
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006501 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6502 goto overflow;
6503 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006504 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006505 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006506 requiredsize = 2*ressize;
6507 if (_PyBytes_Resize(&res, requiredsize))
6508 goto onError;
6509 str = PyBytes_AS_STRING(res) + respos;
6510 ressize = requiredsize;
6511 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006512 /* generate replacement */
6513 for (i = collstart; i < collend; ++i) {
6514 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
Benjamin Peterson29060642009-01-31 22:14:21 +00006515 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006516 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006517 break;
6518 default:
6519 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006520 encoding, reason, unicode, &exc,
6521 collstart, collend, &newpos);
6522 if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
Benjamin Petersonbac79492012-01-14 13:34:47 -05006523 PyUnicode_READY(repunicode) == -1))
Benjamin Peterson29060642009-01-31 22:14:21 +00006524 goto onError;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006525 if (PyBytes_Check(repunicode)) {
6526 /* Directly copy bytes result to output. */
6527 repsize = PyBytes_Size(repunicode);
6528 if (repsize > 1) {
6529 /* Make room for all additional bytes. */
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006530 respos = str - PyBytes_AS_STRING(res);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006531 if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6532 Py_DECREF(repunicode);
6533 goto overflow;
6534 }
Martin v. Löwis011e8422009-05-05 04:43:17 +00006535 if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6536 Py_DECREF(repunicode);
6537 goto onError;
6538 }
Amaury Forgeot d'Arc84ec8d92009-06-29 22:36:49 +00006539 str = PyBytes_AS_STRING(res) + respos;
Martin v. Löwis011e8422009-05-05 04:43:17 +00006540 ressize += repsize-1;
6541 }
6542 memcpy(str, PyBytes_AsString(repunicode), repsize);
6543 str += repsize;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006544 pos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006545 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00006546 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006547 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 /* need more space? (at least enough for what we
6549 have+the replacement+the rest of the string, so
6550 we won't have to check space for encodable characters) */
6551 respos = str - PyBytes_AS_STRING(res);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006552 repsize = PyUnicode_GET_LENGTH(repunicode);
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006553 requiredsize = respos;
6554 if (requiredsize > PY_SSIZE_T_MAX - repsize)
6555 goto overflow;
6556 requiredsize += repsize;
6557 if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6558 goto overflow;
6559 requiredsize += size - collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006560 if (requiredsize > ressize) {
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006561 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
Benjamin Peterson29060642009-01-31 22:14:21 +00006562 requiredsize = 2*ressize;
6563 if (_PyBytes_Resize(&res, requiredsize)) {
6564 Py_DECREF(repunicode);
6565 goto onError;
6566 }
6567 str = PyBytes_AS_STRING(res) + respos;
6568 ressize = requiredsize;
6569 }
6570 /* check if there is anything unencodable in the replacement
6571 and copy it to the output */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006572 for (i = 0; repsize-->0; ++i, ++str) {
6573 c = PyUnicode_READ_CHAR(repunicode, i);
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 if (c >= limit) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006575 raise_encode_exception(&exc, encoding, unicode,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006576 pos, pos+1, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006577 Py_DECREF(repunicode);
6578 goto onError;
6579 }
6580 *str = (char)c;
6581 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006582 pos = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006583 Py_DECREF(repunicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006584 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00006585 }
6586 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006587 /* Resize if we allocated to much */
6588 size = str - PyBytes_AS_STRING(res);
6589 if (size < ressize) { /* If this falls res will be NULL */
Alexandre Vassalottibad1b922008-12-27 09:49:09 +00006590 assert(size >= 0);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006591 if (_PyBytes_Resize(&res, size) < 0)
6592 goto onError;
6593 }
6594
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006595 Py_XDECREF(errorHandler);
6596 Py_XDECREF(exc);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006597 return res;
6598
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006599 overflow:
6600 PyErr_SetString(PyExc_OverflowError,
6601 "encoded result is too long for a Python string");
6602
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006603 onError:
6604 Py_XDECREF(res);
6605 Py_XDECREF(errorHandler);
6606 Py_XDECREF(exc);
6607 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006608}
6609
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006610/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006611PyObject *
6612PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006613 Py_ssize_t size,
6614 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006615{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006616 PyObject *result;
6617 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6618 if (unicode == NULL)
6619 return NULL;
6620 result = unicode_encode_ucs1(unicode, errors, 256);
6621 Py_DECREF(unicode);
6622 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623}
6624
Alexander Belopolsky40018472011-02-26 01:02:56 +00006625PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006626_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627{
6628 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 PyErr_BadArgument();
6630 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006632 if (PyUnicode_READY(unicode) == -1)
6633 return NULL;
6634 /* Fast path: if it is a one-byte string, construct
6635 bytes object directly. */
6636 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6637 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6638 PyUnicode_GET_LENGTH(unicode));
6639 /* Non-Latin-1 characters present. Defer to above function to
6640 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006641 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006642}
6643
6644PyObject*
6645PyUnicode_AsLatin1String(PyObject *unicode)
6646{
6647 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648}
6649
/* --- 7-bit ASCII Codec -------------------------------------------------- */
6651
Alexander Belopolsky40018472011-02-26 01:02:56 +00006652PyObject *
6653PyUnicode_DecodeASCII(const char *s,
6654 Py_ssize_t size,
6655 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006657 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006658 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006659 int kind;
6660 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006661 Py_ssize_t startinpos;
6662 Py_ssize_t endinpos;
6663 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006664 const char *e;
6665 PyObject *errorHandler = NULL;
6666 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006667
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006669 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006670
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006672 if (size == 1 && (unsigned char)s[0] < 128)
6673 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006674
Victor Stinner8f674cc2013-04-17 23:02:17 +02006675 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006676 writer.min_length = size;
6677 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006678 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006680 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006681 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006682 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006683 writer.pos = outpos;
6684 if (writer.pos == size)
6685 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02006686
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006687 s += writer.pos;
6688 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006689 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02006690 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00006691 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006692 PyUnicode_WRITE(kind, data, writer.pos, c);
6693 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00006694 ++s;
6695 }
6696 else {
6697 startinpos = s-starts;
6698 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006699 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00006700 errors, &errorHandler,
6701 "ascii", "ordinal not in range(128)",
6702 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006703 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006704 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006705 kind = writer.kind;
6706 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00006707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006709 Py_XDECREF(errorHandler);
6710 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006711 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006712
Benjamin Peterson29060642009-01-31 22:14:21 +00006713 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006714 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006715 Py_XDECREF(errorHandler);
6716 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717 return NULL;
6718}
6719
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006720/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006721PyObject *
6722PyUnicode_EncodeASCII(const Py_UNICODE *p,
6723 Py_ssize_t size,
6724 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006725{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006726 PyObject *result;
6727 PyObject *unicode = PyUnicode_FromUnicode(p, size);
6728 if (unicode == NULL)
6729 return NULL;
6730 result = unicode_encode_ucs1(unicode, errors, 128);
6731 Py_DECREF(unicode);
6732 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006733}
6734
Alexander Belopolsky40018472011-02-26 01:02:56 +00006735PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006736_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737{
6738 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006739 PyErr_BadArgument();
6740 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006742 if (PyUnicode_READY(unicode) == -1)
6743 return NULL;
6744 /* Fast path: if it is an ASCII-only string, construct bytes object
6745 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02006746 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006747 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6748 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006749 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006750}
6751
6752PyObject *
6753PyUnicode_AsASCIIString(PyObject *unicode)
6754{
6755 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006756}
6757
Victor Stinner99b95382011-07-04 14:23:54 +02006758#ifdef HAVE_MBCS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006759
/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00006761
/* NEED_RETRY is defined when an int is narrower than a size_t. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Provide WC_ERR_INVALID_CHARS when the included headers do not. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6769
6770static char*
6771code_page_name(UINT code_page, PyObject **obj)
6772{
6773 *obj = NULL;
6774 if (code_page == CP_ACP)
6775 return "mbcs";
6776 if (code_page == CP_UTF7)
6777 return "CP_UTF7";
6778 if (code_page == CP_UTF8)
6779 return "CP_UTF8";
6780
6781 *obj = PyBytes_FromFormat("cp%u", code_page);
6782 if (*obj == NULL)
6783 return NULL;
6784 return PyBytes_AS_STRING(*obj);
6785}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006786
Victor Stinner3a50e702011-10-18 21:21:00 +02006787static DWORD
6788decode_code_page_flags(UINT code_page)
6789{
6790 if (code_page == CP_UTF7) {
6791 /* The CP_UTF7 decoder only supports flags=0 */
6792 return 0;
6793 }
6794 else
6795 return MB_ERR_INVALID_CHARS;
6796}
6797
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006798/*
Victor Stinner3a50e702011-10-18 21:21:00 +02006799 * Decode a byte string from a Windows code page into unicode object in strict
6800 * mode.
6801 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006802 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6803 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006804 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006805static int
Victor Stinner3a50e702011-10-18 21:21:00 +02006806decode_code_page_strict(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006807 PyObject **v,
Victor Stinner3a50e702011-10-18 21:21:00 +02006808 const char *in,
6809 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006810{
Victor Stinner3a50e702011-10-18 21:21:00 +02006811 const DWORD flags = decode_code_page_flags(code_page);
Victor Stinner24729f32011-11-10 20:31:37 +01006812 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02006813 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006814
6815 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02006816 assert(insize > 0);
6817 outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6818 if (outsize <= 0)
6819 goto error;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006820
6821 if (*v == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 /* Create unicode object */
Victor Stinnerab595942011-12-17 04:59:06 +01006823 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006824 *v = (PyObject*)_PyUnicode_New(outsize);
Benjamin Peterson29060642009-01-31 22:14:21 +00006825 if (*v == NULL)
6826 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006827 out = PyUnicode_AS_UNICODE(*v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006828 }
6829 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00006830 /* Extend unicode object */
Victor Stinner3a50e702011-10-18 21:21:00 +02006831 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
Victor Stinner16e6a802011-12-12 13:24:15 +01006832 if (unicode_resize(v, n + outsize) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006833 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02006834 out = PyUnicode_AS_UNICODE(*v) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006835 }
6836
6837 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02006838 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6839 if (outsize <= 0)
6840 goto error;
6841 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00006842
Victor Stinner3a50e702011-10-18 21:21:00 +02006843error:
6844 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6845 return -2;
6846 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00006847 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006848}
6849
Victor Stinner3a50e702011-10-18 21:21:00 +02006850/*
6851 * Decode a byte string from a code page into unicode object with an error
6852 * handler.
6853 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02006854 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02006855 * UnicodeDecodeError exception and returns -1 on error.
6856 */
6857static int
6858decode_code_page_errors(UINT code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006859 PyObject **v,
6860 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01006861 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02006862{
6863 const char *startin = in;
6864 const char *endin = in + size;
6865 const DWORD flags = decode_code_page_flags(code_page);
6866 /* Ideally, we should get reason from FormatMessage. This is the Windows
6867 2000 English version of the message. */
6868 const char *reason = "No mapping for the Unicode character exists "
6869 "in the target code page.";
6870 /* each step cannot decode more than 1 character, but a character can be
6871 represented as a surrogate pair */
6872 wchar_t buffer[2], *startout, *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02006873 int insize;
6874 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02006875 PyObject *errorHandler = NULL;
6876 PyObject *exc = NULL;
6877 PyObject *encoding_obj = NULL;
6878 char *encoding;
6879 DWORD err;
6880 int ret = -1;
6881
6882 assert(size > 0);
6883
6884 encoding = code_page_name(code_page, &encoding_obj);
6885 if (encoding == NULL)
6886 return -1;
6887
Victor Stinner7d00cc12014-03-17 23:08:06 +01006888 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02006889 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6890 UnicodeDecodeError. */
6891 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6892 if (exc != NULL) {
6893 PyCodec_StrictErrors(exc);
6894 Py_CLEAR(exc);
6895 }
6896 goto error;
6897 }
6898
6899 if (*v == NULL) {
6900 /* Create unicode object */
6901 if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6902 PyErr_NoMemory();
6903 goto error;
6904 }
Victor Stinnerab595942011-12-17 04:59:06 +01006905 /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
Victor Stinner76a31a62011-11-04 00:05:13 +01006906 *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
Victor Stinner3a50e702011-10-18 21:21:00 +02006907 if (*v == NULL)
6908 goto error;
6909 startout = PyUnicode_AS_UNICODE(*v);
6910 }
6911 else {
6912 /* Extend unicode object */
6913 Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6914 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6915 PyErr_NoMemory();
6916 goto error;
6917 }
Victor Stinner16e6a802011-12-12 13:24:15 +01006918 if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006919 goto error;
6920 startout = PyUnicode_AS_UNICODE(*v) + n;
6921 }
6922
6923 /* Decode the byte string character per character */
6924 out = startout;
6925 while (in < endin)
6926 {
6927 /* Decode a character */
6928 insize = 1;
6929 do
6930 {
6931 outsize = MultiByteToWideChar(code_page, flags,
6932 in, insize,
6933 buffer, Py_ARRAY_LENGTH(buffer));
6934 if (outsize > 0)
6935 break;
6936 err = GetLastError();
6937 if (err != ERROR_NO_UNICODE_TRANSLATION
6938 && err != ERROR_INSUFFICIENT_BUFFER)
6939 {
6940 PyErr_SetFromWindowsErr(0);
6941 goto error;
6942 }
6943 insize++;
6944 }
6945 /* 4=maximum length of a UTF-8 sequence */
6946 while (insize <= 4 && (in + insize) <= endin);
6947
6948 if (outsize <= 0) {
6949 Py_ssize_t startinpos, endinpos, outpos;
6950
Victor Stinner7d00cc12014-03-17 23:08:06 +01006951 /* last character in partial decode? */
6952 if (in + insize >= endin && !final)
6953 break;
6954
Victor Stinner3a50e702011-10-18 21:21:00 +02006955 startinpos = in - startin;
6956 endinpos = startinpos + 1;
6957 outpos = out - PyUnicode_AS_UNICODE(*v);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006958 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02006959 errors, &errorHandler,
6960 encoding, reason,
6961 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Victor Stinner596a6c42011-11-09 00:02:18 +01006962 v, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02006963 {
6964 goto error;
6965 }
Victor Stinner596a6c42011-11-09 00:02:18 +01006966 out = PyUnicode_AS_UNICODE(*v) + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02006967 }
6968 else {
6969 in += insize;
6970 memcpy(out, buffer, outsize * sizeof(wchar_t));
6971 out += outsize;
6972 }
6973 }
6974
6975 /* write a NUL character at the end */
6976 *out = 0;
6977
6978 /* Extend unicode object */
6979 outsize = out - startout;
6980 assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
Victor Stinner16e6a802011-12-12 13:24:15 +01006981 if (unicode_resize(v, outsize) < 0)
Victor Stinner3a50e702011-10-18 21:21:00 +02006982 goto error;
Victor Stinnere1f17c62014-07-25 14:03:03 +02006983 /* (in - startin) <= size and size is an int */
6984 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02006985
6986error:
6987 Py_XDECREF(encoding_obj);
6988 Py_XDECREF(errorHandler);
6989 Py_XDECREF(exc);
6990 return ret;
6991}
6992
Victor Stinner3a50e702011-10-18 21:21:00 +02006993static PyObject *
6994decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01006995 const char *s, Py_ssize_t size,
6996 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00006997{
Victor Stinner76a31a62011-11-04 00:05:13 +01006998 PyObject *v = NULL;
6999 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007000
Victor Stinner3a50e702011-10-18 21:21:00 +02007001 if (code_page < 0) {
7002 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7003 return NULL;
7004 }
7005
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007006 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007007 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007008
Victor Stinner76a31a62011-11-04 00:05:13 +01007009 do
7010 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007011#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007012 if (size > INT_MAX) {
7013 chunk_size = INT_MAX;
7014 final = 0;
7015 done = 0;
7016 }
7017 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007018#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007019 {
7020 chunk_size = (int)size;
7021 final = (consumed == NULL);
7022 done = 1;
7023 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007024
Victor Stinner76a31a62011-11-04 00:05:13 +01007025 if (chunk_size == 0 && done) {
7026 if (v != NULL)
7027 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007028 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007029 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007030
Victor Stinner76a31a62011-11-04 00:05:13 +01007031 converted = decode_code_page_strict(code_page, &v,
7032 s, chunk_size);
7033 if (converted == -2)
7034 converted = decode_code_page_errors(code_page, &v,
7035 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007036 errors, final);
7037 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007038
7039 if (converted < 0) {
7040 Py_XDECREF(v);
7041 return NULL;
7042 }
7043
7044 if (consumed)
7045 *consumed += converted;
7046
7047 s += converted;
7048 size -= converted;
7049 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007050
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007051 return unicode_result(v);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007052}
7053
Alexander Belopolsky40018472011-02-26 01:02:56 +00007054PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007055PyUnicode_DecodeCodePageStateful(int code_page,
7056 const char *s,
7057 Py_ssize_t size,
7058 const char *errors,
7059 Py_ssize_t *consumed)
7060{
7061 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7062}
7063
7064PyObject *
7065PyUnicode_DecodeMBCSStateful(const char *s,
7066 Py_ssize_t size,
7067 const char *errors,
7068 Py_ssize_t *consumed)
7069{
7070 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7071}
7072
7073PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007074PyUnicode_DecodeMBCS(const char *s,
7075 Py_ssize_t size,
7076 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007077{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007078 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7079}
7080
Victor Stinner3a50e702011-10-18 21:21:00 +02007081static DWORD
7082encode_code_page_flags(UINT code_page, const char *errors)
7083{
7084 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007085 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007086 }
7087 else if (code_page == CP_UTF7) {
7088 /* CP_UTF7 only supports flags=0 */
7089 return 0;
7090 }
7091 else {
7092 if (errors != NULL && strcmp(errors, "replace") == 0)
7093 return 0;
7094 else
7095 return WC_NO_BEST_FIT_CHARS;
7096 }
7097}
7098
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007099/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007100 * Encode a Unicode string to a Windows code page into a byte string in strict
7101 * mode.
7102 *
7103 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007104 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007105 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007106static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007107encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007108 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007109 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007110{
Victor Stinner554f3f02010-06-16 23:33:54 +00007111 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007112 BOOL *pusedDefaultChar = &usedDefaultChar;
7113 int outsize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007114 PyObject *exc = NULL;
Victor Stinner24729f32011-11-10 20:31:37 +01007115 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007116 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007117 const DWORD flags = encode_code_page_flags(code_page, NULL);
7118 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007119 /* Create a substring so that we can get the UTF-16 representation
7120 of just the slice under consideration. */
7121 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007122
Martin v. Löwis3d325192011-11-04 18:23:06 +01007123 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007124
Victor Stinner3a50e702011-10-18 21:21:00 +02007125 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007126 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007127 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007128 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007129
Victor Stinner2fc507f2011-11-04 20:06:39 +01007130 substring = PyUnicode_Substring(unicode, offset, offset+len);
7131 if (substring == NULL)
7132 return -1;
7133 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7134 if (p == NULL) {
7135 Py_DECREF(substring);
7136 return -1;
7137 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007138 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007139
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007140 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007141 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007142 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007143 NULL, 0,
7144 NULL, pusedDefaultChar);
7145 if (outsize <= 0)
7146 goto error;
7147 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007148 if (pusedDefaultChar && *pusedDefaultChar) {
7149 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007151 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007152
Victor Stinner3a50e702011-10-18 21:21:00 +02007153 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007154 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007155 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007156 if (*outbytes == NULL) {
7157 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007158 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007159 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007160 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007161 }
7162 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007163 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 const Py_ssize_t n = PyBytes_Size(*outbytes);
7165 if (outsize > PY_SSIZE_T_MAX - n) {
7166 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007167 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007169 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007170 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7171 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007173 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007174 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007175 }
7176
7177 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007178 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007179 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007180 out, outsize,
7181 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007182 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007183 if (outsize <= 0)
7184 goto error;
7185 if (pusedDefaultChar && *pusedDefaultChar)
7186 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007187 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007188
Victor Stinner3a50e702011-10-18 21:21:00 +02007189error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007190 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007191 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7192 return -2;
7193 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007194 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007195}
7196
Victor Stinner3a50e702011-10-18 21:21:00 +02007197/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007198 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007199 * error handler.
7200 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007201 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007202 * -1 on other error.
7203 */
7204static int
7205encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007206 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007207 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007208{
Victor Stinner3a50e702011-10-18 21:21:00 +02007209 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007210 Py_ssize_t pos = unicode_offset;
7211 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007212 /* Ideally, we should get reason from FormatMessage. This is the Windows
7213 2000 English version of the message. */
7214 const char *reason = "invalid character";
7215 /* 4=maximum length of a UTF-8 sequence */
7216 char buffer[4];
7217 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7218 Py_ssize_t outsize;
7219 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 PyObject *errorHandler = NULL;
7221 PyObject *exc = NULL;
7222 PyObject *encoding_obj = NULL;
7223 char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007224 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007225 PyObject *rep;
7226 int ret = -1;
7227
7228 assert(insize > 0);
7229
7230 encoding = code_page_name(code_page, &encoding_obj);
7231 if (encoding == NULL)
7232 return -1;
7233
7234 if (errors == NULL || strcmp(errors, "strict") == 0) {
7235 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7236 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007237 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007238 if (exc != NULL) {
7239 PyCodec_StrictErrors(exc);
7240 Py_DECREF(exc);
7241 }
7242 Py_XDECREF(encoding_obj);
7243 return -1;
7244 }
7245
7246 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7247 pusedDefaultChar = &usedDefaultChar;
7248 else
7249 pusedDefaultChar = NULL;
7250
7251 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7252 PyErr_NoMemory();
7253 goto error;
7254 }
7255 outsize = insize * Py_ARRAY_LENGTH(buffer);
7256
7257 if (*outbytes == NULL) {
7258 /* Create string object */
7259 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7260 if (*outbytes == NULL)
7261 goto error;
7262 out = PyBytes_AS_STRING(*outbytes);
7263 }
7264 else {
7265 /* Extend string object */
7266 Py_ssize_t n = PyBytes_Size(*outbytes);
7267 if (n > PY_SSIZE_T_MAX - outsize) {
7268 PyErr_NoMemory();
7269 goto error;
7270 }
7271 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7272 goto error;
7273 out = PyBytes_AS_STRING(*outbytes) + n;
7274 }
7275
7276 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007277 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007279 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7280 wchar_t chars[2];
7281 int charsize;
7282 if (ch < 0x10000) {
7283 chars[0] = (wchar_t)ch;
7284 charsize = 1;
7285 }
7286 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007287 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7288 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007289 charsize = 2;
7290 }
7291
Victor Stinner3a50e702011-10-18 21:21:00 +02007292 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007293 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 buffer, Py_ARRAY_LENGTH(buffer),
7295 NULL, pusedDefaultChar);
7296 if (outsize > 0) {
7297 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7298 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007299 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007300 memcpy(out, buffer, outsize);
7301 out += outsize;
7302 continue;
7303 }
7304 }
7305 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7306 PyErr_SetFromWindowsErr(0);
7307 goto error;
7308 }
7309
Victor Stinner3a50e702011-10-18 21:21:00 +02007310 rep = unicode_encode_call_errorhandler(
7311 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007312 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007313 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007314 if (rep == NULL)
7315 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007316 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007317
7318 if (PyBytes_Check(rep)) {
7319 outsize = PyBytes_GET_SIZE(rep);
7320 if (outsize != 1) {
7321 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7322 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7323 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7324 Py_DECREF(rep);
7325 goto error;
7326 }
7327 out = PyBytes_AS_STRING(*outbytes) + offset;
7328 }
7329 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7330 out += outsize;
7331 }
7332 else {
7333 Py_ssize_t i;
7334 enum PyUnicode_Kind kind;
7335 void *data;
7336
Benjamin Petersonbac79492012-01-14 13:34:47 -05007337 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 Py_DECREF(rep);
7339 goto error;
7340 }
7341
7342 outsize = PyUnicode_GET_LENGTH(rep);
7343 if (outsize != 1) {
7344 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7345 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7346 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7347 Py_DECREF(rep);
7348 goto error;
7349 }
7350 out = PyBytes_AS_STRING(*outbytes) + offset;
7351 }
7352 kind = PyUnicode_KIND(rep);
7353 data = PyUnicode_DATA(rep);
7354 for (i=0; i < outsize; i++) {
7355 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7356 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007357 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007358 encoding, unicode,
7359 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007360 "unable to encode error handler result to ASCII");
7361 Py_DECREF(rep);
7362 goto error;
7363 }
7364 *out = (unsigned char)ch;
7365 out++;
7366 }
7367 }
7368 Py_DECREF(rep);
7369 }
7370 /* write a NUL byte */
7371 *out = 0;
7372 outsize = out - PyBytes_AS_STRING(*outbytes);
7373 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7374 if (_PyBytes_Resize(outbytes, outsize) < 0)
7375 goto error;
7376 ret = 0;
7377
7378error:
7379 Py_XDECREF(encoding_obj);
7380 Py_XDECREF(errorHandler);
7381 Py_XDECREF(exc);
7382 return ret;
7383}
7384
Victor Stinner3a50e702011-10-18 21:21:00 +02007385static PyObject *
7386encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007387 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007388 const char *errors)
7389{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007390 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007391 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007392 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007393 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007394
Victor Stinner29dacf22015-01-26 16:41:32 +01007395 if (!PyUnicode_Check(unicode)) {
7396 PyErr_BadArgument();
7397 return NULL;
7398 }
7399
Benjamin Petersonbac79492012-01-14 13:34:47 -05007400 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007401 return NULL;
7402 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007403
Victor Stinner3a50e702011-10-18 21:21:00 +02007404 if (code_page < 0) {
7405 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7406 return NULL;
7407 }
7408
Martin v. Löwis3d325192011-11-04 18:23:06 +01007409 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007410 return PyBytes_FromStringAndSize(NULL, 0);
7411
Victor Stinner7581cef2011-11-03 22:32:33 +01007412 offset = 0;
7413 do
7414 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007415#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007416 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007417 chunks. */
7418 if (len > INT_MAX/2) {
7419 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007420 done = 0;
7421 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007422 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007423#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007424 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007425 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007426 done = 1;
7427 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007428
Victor Stinner76a31a62011-11-04 00:05:13 +01007429 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007430 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007431 errors);
7432 if (ret == -2)
7433 ret = encode_code_page_errors(code_page, &outbytes,
7434 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007435 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007436 if (ret < 0) {
7437 Py_XDECREF(outbytes);
7438 return NULL;
7439 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007440
Victor Stinner7581cef2011-11-03 22:32:33 +01007441 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007442 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007443 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007444
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 return outbytes;
7446}
7447
7448PyObject *
7449PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7450 Py_ssize_t size,
7451 const char *errors)
7452{
Victor Stinner7581cef2011-11-03 22:32:33 +01007453 PyObject *unicode, *res;
7454 unicode = PyUnicode_FromUnicode(p, size);
7455 if (unicode == NULL)
7456 return NULL;
7457 res = encode_code_page(CP_ACP, unicode, errors);
7458 Py_DECREF(unicode);
7459 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007460}
7461
7462PyObject *
7463PyUnicode_EncodeCodePage(int code_page,
7464 PyObject *unicode,
7465 const char *errors)
7466{
Victor Stinner7581cef2011-11-03 22:32:33 +01007467 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007468}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007469
Alexander Belopolsky40018472011-02-26 01:02:56 +00007470PyObject *
7471PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007472{
Victor Stinner7581cef2011-11-03 22:32:33 +01007473 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007474}
7475
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007476#undef NEED_RETRY
7477
Victor Stinner99b95382011-07-04 14:23:54 +02007478#endif /* HAVE_MBCS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007479
Guido van Rossumd57fd912000-03-10 22:53:23 +00007480/* --- Character Mapping Codec -------------------------------------------- */
7481
Victor Stinnerfb161b12013-04-18 01:44:27 +02007482static int
7483charmap_decode_string(const char *s,
7484 Py_ssize_t size,
7485 PyObject *mapping,
7486 const char *errors,
7487 _PyUnicodeWriter *writer)
7488{
7489 const char *starts = s;
7490 const char *e;
7491 Py_ssize_t startinpos, endinpos;
7492 PyObject *errorHandler = NULL, *exc = NULL;
7493 Py_ssize_t maplen;
7494 enum PyUnicode_Kind mapkind;
7495 void *mapdata;
7496 Py_UCS4 x;
7497 unsigned char ch;
7498
7499 if (PyUnicode_READY(mapping) == -1)
7500 return -1;
7501
7502 maplen = PyUnicode_GET_LENGTH(mapping);
7503 mapdata = PyUnicode_DATA(mapping);
7504 mapkind = PyUnicode_KIND(mapping);
7505
7506 e = s + size;
7507
7508 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7509 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7510 * is disabled in encoding aliases, latin1 is preferred because
7511 * its implementation is faster. */
7512 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7513 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7514 Py_UCS4 maxchar = writer->maxchar;
7515
7516 assert (writer->kind == PyUnicode_1BYTE_KIND);
7517 while (s < e) {
7518 ch = *s;
7519 x = mapdata_ucs1[ch];
7520 if (x > maxchar) {
7521 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7522 goto onError;
7523 maxchar = writer->maxchar;
7524 outdata = (Py_UCS1 *)writer->data;
7525 }
7526 outdata[writer->pos] = x;
7527 writer->pos++;
7528 ++s;
7529 }
7530 return 0;
7531 }
7532
7533 while (s < e) {
7534 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7535 enum PyUnicode_Kind outkind = writer->kind;
7536 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7537 if (outkind == PyUnicode_1BYTE_KIND) {
7538 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7539 Py_UCS4 maxchar = writer->maxchar;
7540 while (s < e) {
7541 ch = *s;
7542 x = mapdata_ucs2[ch];
7543 if (x > maxchar)
7544 goto Error;
7545 outdata[writer->pos] = x;
7546 writer->pos++;
7547 ++s;
7548 }
7549 break;
7550 }
7551 else if (outkind == PyUnicode_2BYTE_KIND) {
7552 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7553 while (s < e) {
7554 ch = *s;
7555 x = mapdata_ucs2[ch];
7556 if (x == 0xFFFE)
7557 goto Error;
7558 outdata[writer->pos] = x;
7559 writer->pos++;
7560 ++s;
7561 }
7562 break;
7563 }
7564 }
7565 ch = *s;
7566
7567 if (ch < maplen)
7568 x = PyUnicode_READ(mapkind, mapdata, ch);
7569 else
7570 x = 0xfffe; /* invalid value */
7571Error:
7572 if (x == 0xfffe)
7573 {
7574 /* undefined mapping */
7575 startinpos = s-starts;
7576 endinpos = startinpos+1;
7577 if (unicode_decode_call_errorhandler_writer(
7578 errors, &errorHandler,
7579 "charmap", "character maps to <undefined>",
7580 &starts, &e, &startinpos, &endinpos, &exc, &s,
7581 writer)) {
7582 goto onError;
7583 }
7584 continue;
7585 }
7586
7587 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7588 goto onError;
7589 ++s;
7590 }
7591 Py_XDECREF(errorHandler);
7592 Py_XDECREF(exc);
7593 return 0;
7594
7595onError:
7596 Py_XDECREF(errorHandler);
7597 Py_XDECREF(exc);
7598 return -1;
7599}
7600
7601static int
7602charmap_decode_mapping(const char *s,
7603 Py_ssize_t size,
7604 PyObject *mapping,
7605 const char *errors,
7606 _PyUnicodeWriter *writer)
7607{
7608 const char *starts = s;
7609 const char *e;
7610 Py_ssize_t startinpos, endinpos;
7611 PyObject *errorHandler = NULL, *exc = NULL;
7612 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007613 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007614
7615 e = s + size;
7616
7617 while (s < e) {
7618 ch = *s;
7619
7620 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7621 key = PyLong_FromLong((long)ch);
7622 if (key == NULL)
7623 goto onError;
7624
7625 item = PyObject_GetItem(mapping, key);
7626 Py_DECREF(key);
7627 if (item == NULL) {
7628 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7629 /* No mapping found means: mapping is undefined. */
7630 PyErr_Clear();
7631 goto Undefined;
7632 } else
7633 goto onError;
7634 }
7635
7636 /* Apply mapping */
7637 if (item == Py_None)
7638 goto Undefined;
7639 if (PyLong_Check(item)) {
7640 long value = PyLong_AS_LONG(item);
7641 if (value == 0xFFFE)
7642 goto Undefined;
7643 if (value < 0 || value > MAX_UNICODE) {
7644 PyErr_Format(PyExc_TypeError,
7645 "character mapping must be in range(0x%lx)",
7646 (unsigned long)MAX_UNICODE + 1);
7647 goto onError;
7648 }
7649
7650 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7651 goto onError;
7652 }
7653 else if (PyUnicode_Check(item)) {
7654 if (PyUnicode_READY(item) == -1)
7655 goto onError;
7656 if (PyUnicode_GET_LENGTH(item) == 1) {
7657 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7658 if (value == 0xFFFE)
7659 goto Undefined;
7660 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7661 goto onError;
7662 }
7663 else {
7664 writer->overallocate = 1;
7665 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7666 goto onError;
7667 }
7668 }
7669 else {
7670 /* wrong return value */
7671 PyErr_SetString(PyExc_TypeError,
7672 "character mapping must return integer, None or str");
7673 goto onError;
7674 }
7675 Py_CLEAR(item);
7676 ++s;
7677 continue;
7678
7679Undefined:
7680 /* undefined mapping */
7681 Py_CLEAR(item);
7682 startinpos = s-starts;
7683 endinpos = startinpos+1;
7684 if (unicode_decode_call_errorhandler_writer(
7685 errors, &errorHandler,
7686 "charmap", "character maps to <undefined>",
7687 &starts, &e, &startinpos, &endinpos, &exc, &s,
7688 writer)) {
7689 goto onError;
7690 }
7691 }
7692 Py_XDECREF(errorHandler);
7693 Py_XDECREF(exc);
7694 return 0;
7695
7696onError:
7697 Py_XDECREF(item);
7698 Py_XDECREF(errorHandler);
7699 Py_XDECREF(exc);
7700 return -1;
7701}
7702
Alexander Belopolsky40018472011-02-26 01:02:56 +00007703PyObject *
7704PyUnicode_DecodeCharmap(const char *s,
7705 Py_ssize_t size,
7706 PyObject *mapping,
7707 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007708{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007709 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00007710
Guido van Rossumd57fd912000-03-10 22:53:23 +00007711 /* Default to Latin-1 */
7712 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007713 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007714
Guido van Rossumd57fd912000-03-10 22:53:23 +00007715 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02007716 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02007717 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007718 writer.min_length = size;
7719 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007720 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007721
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007722 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007723 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7724 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00007725 }
7726 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02007727 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7728 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007729 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007730 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007731
Benjamin Peterson29060642009-01-31 22:14:21 +00007732 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007733 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007734 return NULL;
7735}
7736
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007737/* Charmap encoding: the lookup table */
7738
Alexander Belopolsky40018472011-02-26 01:02:56 +00007739struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00007740 PyObject_HEAD
7741 unsigned char level1[32];
7742 int count2, count3;
7743 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007744};
7745
7746static PyObject*
7747encoding_map_size(PyObject *obj, PyObject* args)
7748{
7749 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007750 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00007751 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007752}
7753
7754static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007755 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00007756 PyDoc_STR("Return the size (in bytes) of this object") },
7757 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007758};
7759
7760static void
7761encoding_map_dealloc(PyObject* o)
7762{
Benjamin Peterson14339b62009-01-31 16:36:08 +00007763 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007764}
7765
7766static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00007767 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007768 "EncodingMap", /*tp_name*/
7769 sizeof(struct encoding_map), /*tp_basicsize*/
7770 0, /*tp_itemsize*/
7771 /* methods */
7772 encoding_map_dealloc, /*tp_dealloc*/
7773 0, /*tp_print*/
7774 0, /*tp_getattr*/
7775 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00007776 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00007777 0, /*tp_repr*/
7778 0, /*tp_as_number*/
7779 0, /*tp_as_sequence*/
7780 0, /*tp_as_mapping*/
7781 0, /*tp_hash*/
7782 0, /*tp_call*/
7783 0, /*tp_str*/
7784 0, /*tp_getattro*/
7785 0, /*tp_setattro*/
7786 0, /*tp_as_buffer*/
7787 Py_TPFLAGS_DEFAULT, /*tp_flags*/
7788 0, /*tp_doc*/
7789 0, /*tp_traverse*/
7790 0, /*tp_clear*/
7791 0, /*tp_richcompare*/
7792 0, /*tp_weaklistoffset*/
7793 0, /*tp_iter*/
7794 0, /*tp_iternext*/
7795 encoding_map_methods, /*tp_methods*/
7796 0, /*tp_members*/
7797 0, /*tp_getset*/
7798 0, /*tp_base*/
7799 0, /*tp_dict*/
7800 0, /*tp_descr_get*/
7801 0, /*tp_descr_set*/
7802 0, /*tp_dictoffset*/
7803 0, /*tp_init*/
7804 0, /*tp_alloc*/
7805 0, /*tp_new*/
7806 0, /*tp_free*/
7807 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007808};
7809
7810PyObject*
7811PyUnicode_BuildEncodingMap(PyObject* string)
7812{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007813 PyObject *result;
7814 struct encoding_map *mresult;
7815 int i;
7816 int need_dict = 0;
7817 unsigned char level1[32];
7818 unsigned char level2[512];
7819 unsigned char *mlevel1, *mlevel2, *mlevel3;
7820 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007821 int kind;
7822 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007823 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007824 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007825
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007826 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007827 PyErr_BadArgument();
7828 return NULL;
7829 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007830 kind = PyUnicode_KIND(string);
7831 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007832 length = PyUnicode_GET_LENGTH(string);
7833 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007834 memset(level1, 0xFF, sizeof level1);
7835 memset(level2, 0xFF, sizeof level2);
7836
7837 /* If there isn't a one-to-one mapping of NULL to \0,
7838 or if there are non-BMP characters, we need to use
7839 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007840 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007841 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007842 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007843 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007844 ch = PyUnicode_READ(kind, data, i);
7845 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007846 need_dict = 1;
7847 break;
7848 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007849 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007850 /* unmapped character */
7851 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007852 l1 = ch >> 11;
7853 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007854 if (level1[l1] == 0xFF)
7855 level1[l1] = count2++;
7856 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00007857 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007858 }
7859
7860 if (count2 >= 0xFF || count3 >= 0xFF)
7861 need_dict = 1;
7862
7863 if (need_dict) {
7864 PyObject *result = PyDict_New();
7865 PyObject *key, *value;
7866 if (!result)
7867 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007868 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007869 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00007870 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007871 if (!key || !value)
7872 goto failed1;
7873 if (PyDict_SetItem(result, key, value) == -1)
7874 goto failed1;
7875 Py_DECREF(key);
7876 Py_DECREF(value);
7877 }
7878 return result;
7879 failed1:
7880 Py_XDECREF(key);
7881 Py_XDECREF(value);
7882 Py_DECREF(result);
7883 return NULL;
7884 }
7885
7886 /* Create a three-level trie */
7887 result = PyObject_MALLOC(sizeof(struct encoding_map) +
7888 16*count2 + 128*count3 - 1);
7889 if (!result)
7890 return PyErr_NoMemory();
7891 PyObject_Init(result, &EncodingMapType);
7892 mresult = (struct encoding_map*)result;
7893 mresult->count2 = count2;
7894 mresult->count3 = count3;
7895 mlevel1 = mresult->level1;
7896 mlevel2 = mresult->level23;
7897 mlevel3 = mresult->level23 + 16*count2;
7898 memcpy(mlevel1, level1, 32);
7899 memset(mlevel2, 0xFF, 16*count2);
7900 memset(mlevel3, 0, 128*count3);
7901 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007902 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007903 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007904 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7905 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007906 /* unmapped character */
7907 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007908 o1 = ch>>11;
7909 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007910 i2 = 16*mlevel1[o1] + o2;
7911 if (mlevel2[i2] == 0xFF)
7912 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02007913 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007914 i3 = 128*mlevel2[i2] + o3;
7915 mlevel3[i3] = i;
7916 }
7917 return result;
7918}
7919
7920static int
Victor Stinner22168992011-11-20 17:09:18 +01007921encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007922{
7923 struct encoding_map *map = (struct encoding_map*)mapping;
7924 int l1 = c>>11;
7925 int l2 = (c>>7) & 0xF;
7926 int l3 = c & 0x7F;
7927 int i;
7928
Victor Stinner22168992011-11-20 17:09:18 +01007929 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00007930 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007931 if (c == 0)
7932 return 0;
7933 /* level 1*/
7934 i = map->level1[l1];
7935 if (i == 0xFF) {
7936 return -1;
7937 }
7938 /* level 2*/
7939 i = map->level23[16*i+l2];
7940 if (i == 0xFF) {
7941 return -1;
7942 }
7943 /* level 3 */
7944 i = map->level23[16*map->count2 + 128*i + l3];
7945 if (i == 0) {
7946 return -1;
7947 }
7948 return i;
7949}
7950
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007951/* Lookup the character ch in the mapping. If the character
7952 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00007953 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007954static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01007955charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007956{
Christian Heimes217cfd12007-12-02 14:31:20 +00007957 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007958 PyObject *x;
7959
7960 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007961 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007962 x = PyObject_GetItem(mapping, w);
7963 Py_DECREF(w);
7964 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007965 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7966 /* No mapping found means: mapping is undefined. */
7967 PyErr_Clear();
7968 x = Py_None;
7969 Py_INCREF(x);
7970 return x;
7971 } else
7972 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007973 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00007974 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00007975 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00007976 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007977 long value = PyLong_AS_LONG(x);
7978 if (value < 0 || value > 255) {
7979 PyErr_SetString(PyExc_TypeError,
7980 "character mapping must be in range(256)");
7981 Py_DECREF(x);
7982 return NULL;
7983 }
7984 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007985 }
Christian Heimes72b710a2008-05-26 13:28:38 +00007986 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00007987 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007988 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007989 /* wrong return value */
7990 PyErr_Format(PyExc_TypeError,
7991 "character mapping must return integer, bytes or None, not %.400s",
7992 x->ob_type->tp_name);
7993 Py_DECREF(x);
7994 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007995 }
7996}
7997
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00007998static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00007999charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008000{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008001 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8002 /* exponentially overallocate to minimize reallocations */
8003 if (requiredsize < 2*outsize)
8004 requiredsize = 2*outsize;
8005 if (_PyBytes_Resize(outobj, requiredsize))
8006 return -1;
8007 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008008}
8009
/* Outcome of trying to encode one character, see charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,        /* the character was written to the output */
    enc_FAILED,         /* the character has no mapping; nothing written */
    enc_EXCEPTION       /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008013/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008014 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008015 space is available. Return a new reference to the object that
8016 was put in the output buffer, or Py_None, if the mapping was undefined
8017 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008018 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008019static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008020charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008021 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008022{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008023 PyObject *rep;
8024 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008025 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008026
Christian Heimes90aa7642007-12-19 02:45:37 +00008027 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008028 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008029 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008030 if (res == -1)
8031 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008032 if (outsize<requiredsize)
8033 if (charmapencode_resize(outobj, outpos, requiredsize))
8034 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008035 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008036 outstart[(*outpos)++] = (char)res;
8037 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008038 }
8039
8040 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008041 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008042 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008043 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008044 Py_DECREF(rep);
8045 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008046 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008047 if (PyLong_Check(rep)) {
8048 Py_ssize_t requiredsize = *outpos+1;
8049 if (outsize<requiredsize)
8050 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8051 Py_DECREF(rep);
8052 return enc_EXCEPTION;
8053 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008054 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008056 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008057 else {
8058 const char *repchars = PyBytes_AS_STRING(rep);
8059 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8060 Py_ssize_t requiredsize = *outpos+repsize;
8061 if (outsize<requiredsize)
8062 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8063 Py_DECREF(rep);
8064 return enc_EXCEPTION;
8065 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008066 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008067 memcpy(outstart + *outpos, repchars, repsize);
8068 *outpos += repsize;
8069 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008070 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008071 Py_DECREF(rep);
8072 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008073}
8074
8075/* handle an error in PyUnicode_EncodeCharmap
8076 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008077static int
8078charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008079 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008080 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00008081 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008082 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008083{
8084 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008085 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008086 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008087 enum PyUnicode_Kind kind;
8088 void *data;
8089 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008090 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008091 Py_ssize_t collstartpos = *inpos;
8092 Py_ssize_t collendpos = *inpos+1;
8093 Py_ssize_t collpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008094 char *encoding = "charmap";
8095 char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008096 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008097 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008098 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008099
Benjamin Petersonbac79492012-01-14 13:34:47 -05008100 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008101 return -1;
8102 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008103 /* find all unencodable characters */
8104 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008105 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008106 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008107 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008108 val = encoding_map_lookup(ch, mapping);
8109 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008110 break;
8111 ++collendpos;
8112 continue;
8113 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008114
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008115 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8116 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 if (rep==NULL)
8118 return -1;
8119 else if (rep!=Py_None) {
8120 Py_DECREF(rep);
8121 break;
8122 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008123 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008125 }
8126 /* cache callback name lookup
8127 * (if not done yet, i.e. it's the first error) */
8128 if (*known_errorHandler==-1) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008129 if ((errors==NULL) || (!strcmp(errors, "strict")))
8130 *known_errorHandler = 1;
8131 else if (!strcmp(errors, "replace"))
8132 *known_errorHandler = 2;
8133 else if (!strcmp(errors, "ignore"))
8134 *known_errorHandler = 3;
8135 else if (!strcmp(errors, "xmlcharrefreplace"))
8136 *known_errorHandler = 4;
8137 else
8138 *known_errorHandler = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008139 }
8140 switch (*known_errorHandler) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008141 case 1: /* strict */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008142 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008143 return -1;
8144 case 2: /* replace */
8145 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 x = charmapencode_output('?', mapping, res, respos);
8147 if (x==enc_EXCEPTION) {
8148 return -1;
8149 }
8150 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008151 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008152 return -1;
8153 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008154 }
8155 /* fall through */
8156 case 3: /* ignore */
8157 *inpos = collendpos;
8158 break;
8159 case 4: /* xmlcharrefreplace */
8160 /* generate replacement (temporarily (mis)uses p) */
8161 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008162 char buffer[2+29+1+1];
8163 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008164 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 for (cp = buffer; *cp; ++cp) {
8166 x = charmapencode_output(*cp, mapping, res, respos);
8167 if (x==enc_EXCEPTION)
8168 return -1;
8169 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008170 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008171 return -1;
8172 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008173 }
8174 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008175 *inpos = collendpos;
8176 break;
8177 default:
8178 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008179 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008181 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008182 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008183 if (PyBytes_Check(repunicode)) {
8184 /* Directly copy bytes result to output. */
8185 Py_ssize_t outsize = PyBytes_Size(*res);
8186 Py_ssize_t requiredsize;
8187 repsize = PyBytes_Size(repunicode);
8188 requiredsize = *respos + repsize;
8189 if (requiredsize > outsize)
8190 /* Make room for all additional bytes. */
8191 if (charmapencode_resize(res, respos, requiredsize)) {
8192 Py_DECREF(repunicode);
8193 return -1;
8194 }
8195 memcpy(PyBytes_AsString(*res) + *respos,
8196 PyBytes_AsString(repunicode), repsize);
8197 *respos += repsize;
8198 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008199 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008200 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008201 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008202 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008203 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008204 Py_DECREF(repunicode);
8205 return -1;
8206 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008207 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008208 data = PyUnicode_DATA(repunicode);
8209 kind = PyUnicode_KIND(repunicode);
8210 for (index = 0; index < repsize; index++) {
8211 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8212 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008213 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008214 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008215 return -1;
8216 }
8217 else if (x==enc_FAILED) {
8218 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008219 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008220 return -1;
8221 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008222 }
8223 *inpos = newpos;
8224 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008225 }
8226 return 0;
8227}
8228
Alexander Belopolsky40018472011-02-26 01:02:56 +00008229PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008230_PyUnicode_EncodeCharmap(PyObject *unicode,
8231 PyObject *mapping,
8232 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008233{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008234 /* output object */
8235 PyObject *res = NULL;
8236 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008237 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008238 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008239 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008240 Py_ssize_t respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008241 PyObject *errorHandler = NULL;
8242 PyObject *exc = NULL;
8243 /* the following variable is used for caching string comparisons
8244 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8245 * 3=ignore, 4=xmlcharrefreplace */
8246 int known_errorHandler = -1;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008247 void *data;
8248 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008249
Benjamin Petersonbac79492012-01-14 13:34:47 -05008250 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008251 return NULL;
8252 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008253 data = PyUnicode_DATA(unicode);
8254 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008255
Guido van Rossumd57fd912000-03-10 22:53:23 +00008256 /* Default to Latin-1 */
8257 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008258 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008259
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008260 /* allocate enough for a simple encoding without
8261 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008262 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008263 if (res == NULL)
8264 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008265 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008267
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008268 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008269 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008271 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 if (x==enc_EXCEPTION) /* error */
8273 goto onError;
8274 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008275 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008276 &exc,
8277 &known_errorHandler, &errorHandler, errors,
8278 &res, &respos)) {
8279 goto onError;
8280 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008281 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008282 else
8283 /* done with this character => adjust input position */
8284 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008285 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008286
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008287 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008288 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008289 if (_PyBytes_Resize(&res, respos) < 0)
8290 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008291
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008292 Py_XDECREF(exc);
8293 Py_XDECREF(errorHandler);
8294 return res;
8295
Benjamin Peterson29060642009-01-31 22:14:21 +00008296 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008297 Py_XDECREF(res);
8298 Py_XDECREF(exc);
8299 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300 return NULL;
8301}
8302
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008303/* Deprecated */
8304PyObject *
8305PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8306 Py_ssize_t size,
8307 PyObject *mapping,
8308 const char *errors)
8309{
8310 PyObject *result;
8311 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8312 if (unicode == NULL)
8313 return NULL;
8314 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8315 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008316 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008317}
8318
Alexander Belopolsky40018472011-02-26 01:02:56 +00008319PyObject *
8320PyUnicode_AsCharmapString(PyObject *unicode,
8321 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322{
8323 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008324 PyErr_BadArgument();
8325 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008327 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328}
8329
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008330/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008331static void
8332make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008333 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008334 Py_ssize_t startpos, Py_ssize_t endpos,
8335 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008337 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008338 *exceptionObject = _PyUnicodeTranslateError_Create(
8339 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 }
8341 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008342 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8343 goto onError;
8344 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8345 goto onError;
8346 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8347 goto onError;
8348 return;
8349 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008350 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008351 }
8352}
8353
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008354/* error handling callback helper:
8355 build arguments, call the callback and check the arguments,
8356 put the result into newpos and return the replacement string, which
8357 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008358static PyObject *
8359unicode_translate_call_errorhandler(const char *errors,
8360 PyObject **errorHandler,
8361 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008362 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008363 Py_ssize_t startpos, Py_ssize_t endpos,
8364 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008365{
Benjamin Peterson142957c2008-07-04 19:55:29 +00008366 static char *argparse = "O!n;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008368 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008369 PyObject *restuple;
8370 PyObject *resunicode;
8371
8372 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008373 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008375 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008376 }
8377
8378 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008379 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008380 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008382
8383 restuple = PyObject_CallFunctionObjArgs(
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008387 if (!PyTuple_Check(restuple)) {
Benjamin Petersond75fcb42009-02-19 04:22:03 +00008388 PyErr_SetString(PyExc_TypeError, &argparse[4]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 Py_DECREF(restuple);
8390 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008391 }
8392 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 &resunicode, &i_newpos)) {
8394 Py_DECREF(restuple);
8395 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008396 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008397 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008398 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008399 else
8400 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008401 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008402 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 Py_DECREF(restuple);
8404 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008405 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008406 Py_INCREF(resunicode);
8407 Py_DECREF(restuple);
8408 return resunicode;
8409}
8410
8411/* Lookup the character ch in the mapping and put the result in result,
8412 which must be decrefed by the caller.
8413 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008414static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008415charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008416{
Christian Heimes217cfd12007-12-02 14:31:20 +00008417 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 PyObject *x;
8419
8420 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 x = PyObject_GetItem(mapping, w);
8423 Py_DECREF(w);
8424 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8426 /* No mapping found means: use 1:1 mapping. */
8427 PyErr_Clear();
8428 *result = NULL;
8429 return 0;
8430 } else
8431 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432 }
8433 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 *result = x;
8435 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008437 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008438 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008439 if (value < 0 || value > MAX_UNICODE) {
8440 PyErr_Format(PyExc_ValueError,
8441 "character mapping must be in range(0x%x)",
8442 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 Py_DECREF(x);
8444 return -1;
8445 }
8446 *result = x;
8447 return 0;
8448 }
8449 else if (PyUnicode_Check(x)) {
8450 *result = x;
8451 return 0;
8452 }
8453 else {
8454 /* wrong return value */
8455 PyErr_SetString(PyExc_TypeError,
8456 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008457 Py_DECREF(x);
8458 return -1;
8459 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008460}
Victor Stinner1194ea02014-04-04 19:37:40 +02008461
8462/* lookup the character, write the result into the writer.
8463 Return 1 if the result was written into the writer, return 0 if the mapping
8464 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008465static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008466charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8467 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008468{
Victor Stinner1194ea02014-04-04 19:37:40 +02008469 PyObject *item;
8470
8471 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008473
8474 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008475 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008476 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008477 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008479 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008480 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008481
8482 if (item == Py_None) {
8483 Py_DECREF(item);
8484 return 0;
8485 }
8486
8487 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008488 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8489 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8490 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008491 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8492 Py_DECREF(item);
8493 return -1;
8494 }
8495 Py_DECREF(item);
8496 return 1;
8497 }
8498
8499 if (!PyUnicode_Check(item)) {
8500 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008501 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008502 }
8503
8504 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8505 Py_DECREF(item);
8506 return -1;
8507 }
8508
8509 Py_DECREF(item);
8510 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008511}
8512
Victor Stinner89a76ab2014-04-05 11:44:04 +02008513static int
8514unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8515 Py_UCS1 *translate)
8516{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008517 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008518 int ret = 0;
8519
Victor Stinner89a76ab2014-04-05 11:44:04 +02008520 if (charmaptranslate_lookup(ch, mapping, &item)) {
8521 return -1;
8522 }
8523
8524 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008525 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008526 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008527 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008528 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008529 /* not found => default to 1:1 mapping */
8530 translate[ch] = ch;
8531 return 1;
8532 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008533 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008534 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008535 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8536 used it */
8537 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008538 /* invalid character or character outside ASCII:
8539 skip the fast translate */
8540 goto exit;
8541 }
8542 translate[ch] = (Py_UCS1)replace;
8543 }
8544 else if (PyUnicode_Check(item)) {
8545 Py_UCS4 replace;
8546
8547 if (PyUnicode_READY(item) == -1) {
8548 Py_DECREF(item);
8549 return -1;
8550 }
8551 if (PyUnicode_GET_LENGTH(item) != 1)
8552 goto exit;
8553
8554 replace = PyUnicode_READ_CHAR(item, 0);
8555 if (replace > 127)
8556 goto exit;
8557 translate[ch] = (Py_UCS1)replace;
8558 }
8559 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008560 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008561 goto exit;
8562 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008563 ret = 1;
8564
Benjamin Peterson1365de72014-04-07 20:15:41 -04008565 exit:
8566 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008567 return ret;
8568}
8569
8570/* Fast path for ascii => ascii translation. Return 1 if the whole string
8571 was translated into writer, return 0 if the input string was partially
8572 translated into writer, raise an exception and return -1 on error. */
8573static int
8574unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner872b2912014-04-05 14:27:07 +02008575 _PyUnicodeWriter *writer, int ignore)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008576{
Victor Stinner872b2912014-04-05 14:27:07 +02008577 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008578 Py_ssize_t len;
8579 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008580 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008581
8582 if (PyUnicode_READY(input) == -1)
8583 return -1;
8584 if (!PyUnicode_IS_ASCII(input))
8585 return 0;
8586 len = PyUnicode_GET_LENGTH(input);
8587
Victor Stinner872b2912014-04-05 14:27:07 +02008588 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008589
8590 in = PyUnicode_1BYTE_DATA(input);
8591 end = in + len;
8592
8593 assert(PyUnicode_IS_ASCII(writer->buffer));
8594 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8595 out = PyUnicode_1BYTE_DATA(writer->buffer);
8596
Victor Stinner872b2912014-04-05 14:27:07 +02008597 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008598 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008599 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008600 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008601 int translate = unicode_fast_translate_lookup(mapping, ch,
8602 ascii_table);
8603 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008604 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008605 if (translate == 0)
8606 goto exit;
8607 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008608 }
Victor Stinner872b2912014-04-05 14:27:07 +02008609 if (ch2 == 0xfe) {
8610 if (ignore)
8611 continue;
8612 goto exit;
8613 }
8614 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008615 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008616 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008617 }
Victor Stinner872b2912014-04-05 14:27:07 +02008618 res = 1;
8619
8620exit:
8621 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8622 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008623}
8624
Alexander Belopolsky40018472011-02-26 01:02:56 +00008625PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008626_PyUnicode_TranslateCharmap(PyObject *input,
8627 PyObject *mapping,
8628 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008630 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008631 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008632 Py_ssize_t size, i;
8633 int kind;
8634 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008635 _PyUnicodeWriter writer;
8636 /* error handler */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008637 char *reason = "character maps to <undefined>";
8638 PyObject *errorHandler = NULL;
8639 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008640 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008641 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008642
Guido van Rossumd57fd912000-03-10 22:53:23 +00008643 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 PyErr_BadArgument();
8645 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008646 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008648 if (PyUnicode_READY(input) == -1)
8649 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008650 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008651 kind = PyUnicode_KIND(input);
8652 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008653
8654 if (size == 0) {
8655 Py_INCREF(input);
8656 return input;
8657 }
8658
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008659 /* allocate enough for a simple 1:1 translation without
8660 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008661 _PyUnicodeWriter_Init(&writer);
8662 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664
Victor Stinner872b2912014-04-05 14:27:07 +02008665 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8666
8667 res = unicode_fast_translate(input, mapping, &writer, ignore);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008668 if (res < 0) {
8669 _PyUnicodeWriter_Dealloc(&writer);
8670 return NULL;
8671 }
8672 if (res == 1)
8673 return _PyUnicodeWriter_Finish(&writer);
8674
Victor Stinner89a76ab2014-04-05 11:44:04 +02008675 i = writer.pos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008676 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008677 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008678 int translate;
8679 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8680 Py_ssize_t newpos;
8681 /* startpos for collecting untranslatable chars */
8682 Py_ssize_t collstart;
8683 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02008684 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685
Victor Stinner1194ea02014-04-04 19:37:40 +02008686 ch = PyUnicode_READ(kind, data, i);
8687 translate = charmaptranslate_output(ch, mapping, &writer);
8688 if (translate < 0)
8689 goto onError;
8690
8691 if (translate != 0) {
8692 /* it worked => adjust input pointer */
8693 ++i;
8694 continue;
8695 }
8696
8697 /* untranslatable character */
8698 collstart = i;
8699 collend = i+1;
8700
8701 /* find all untranslatable characters */
8702 while (collend < size) {
8703 PyObject *x;
8704 ch = PyUnicode_READ(kind, data, collend);
8705 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00008706 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02008707 Py_XDECREF(x);
8708 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02008710 ++collend;
8711 }
8712
8713 if (ignore) {
8714 i = collend;
8715 }
8716 else {
8717 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8718 reason, input, &exc,
8719 collstart, collend, &newpos);
8720 if (repunicode == NULL)
8721 goto onError;
8722 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02008724 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008725 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008726 Py_DECREF(repunicode);
8727 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008728 }
8729 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730 Py_XDECREF(exc);
8731 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02008732 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008733
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02008735 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008736 Py_XDECREF(exc);
8737 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738 return NULL;
8739}
8740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008741/* Deprecated. Use PyUnicode_Translate instead. */
8742PyObject *
8743PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8744 Py_ssize_t size,
8745 PyObject *mapping,
8746 const char *errors)
8747{
Christian Heimes5f520f42012-09-11 14:03:25 +02008748 PyObject *result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008749 PyObject *unicode = PyUnicode_FromUnicode(p, size);
8750 if (!unicode)
8751 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02008752 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8753 Py_DECREF(unicode);
8754 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008755}
8756
Alexander Belopolsky40018472011-02-26 01:02:56 +00008757PyObject *
8758PyUnicode_Translate(PyObject *str,
8759 PyObject *mapping,
8760 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761{
8762 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00008763
Guido van Rossumd57fd912000-03-10 22:53:23 +00008764 str = PyUnicode_FromObject(str);
8765 if (str == NULL)
Christian Heimes5f520f42012-09-11 14:03:25 +02008766 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008767 result = _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008768 Py_DECREF(str);
8769 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008770}
Tim Petersced69f82003-09-16 20:30:58 +00008771
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008772static Py_UCS4
Victor Stinner9310abb2011-10-05 00:59:23 +02008773fix_decimal_and_space_to_ascii(PyObject *self)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774{
8775 /* No need to call PyUnicode_READY(self) because this function is only
8776 called as a callback from fixup() which does it already. */
8777 const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8778 const int kind = PyUnicode_KIND(self);
8779 void *data = PyUnicode_DATA(self);
Victor Stinnere6abb482012-05-02 01:15:40 +02008780 Py_UCS4 maxchar = 127, ch, fixed;
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008781 int modified = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 Py_ssize_t i;
8783
8784 for (i = 0; i < len; ++i) {
8785 ch = PyUnicode_READ(kind, data, i);
8786 fixed = 0;
8787 if (ch > 127) {
8788 if (Py_UNICODE_ISSPACE(ch))
8789 fixed = ' ';
8790 else {
8791 const int decimal = Py_UNICODE_TODECIMAL(ch);
8792 if (decimal >= 0)
8793 fixed = '0' + decimal;
8794 }
8795 if (fixed != 0) {
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008796 modified = 1;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008797 maxchar = Py_MAX(maxchar, fixed);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008798 PyUnicode_WRITE(kind, data, i, fixed);
8799 }
Victor Stinnere6abb482012-05-02 01:15:40 +02008800 else
Benjamin Peterson7e303732013-06-10 09:19:46 -07008801 maxchar = Py_MAX(maxchar, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008803 }
8804
Benjamin Peterson821e4cf2012-01-12 15:40:18 -05008805 return (modified) ? maxchar : 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806}
8807
8808PyObject *
8809_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8810{
8811 if (!PyUnicode_Check(unicode)) {
8812 PyErr_BadInternalCall();
8813 return NULL;
8814 }
8815 if (PyUnicode_READY(unicode) == -1)
8816 return NULL;
8817 if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8818 /* If the string is already ASCII, just return the same string */
8819 Py_INCREF(unicode);
8820 return unicode;
8821 }
Victor Stinner9310abb2011-10-05 00:59:23 +02008822 return fixup(unicode, fix_decimal_and_space_to_ascii);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008823}
8824
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008825PyObject *
8826PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8827 Py_ssize_t length)
8828{
Victor Stinnerf0124502011-11-21 23:12:56 +01008829 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008830 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01008831 Py_UCS4 maxchar;
8832 enum PyUnicode_Kind kind;
8833 void *data;
8834
Victor Stinner99d7ad02012-02-22 13:37:39 +01008835 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008836 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008837 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008838 if (ch > 127) {
8839 int decimal = Py_UNICODE_TODECIMAL(ch);
8840 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01008841 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07008842 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008843 }
8844 }
Victor Stinnerf0124502011-11-21 23:12:56 +01008845
8846 /* Copy to a new string */
8847 decimal = PyUnicode_New(length, maxchar);
8848 if (decimal == NULL)
8849 return decimal;
8850 kind = PyUnicode_KIND(decimal);
8851 data = PyUnicode_DATA(decimal);
8852 /* Iterate over code points */
8853 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02008854 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01008855 if (ch > 127) {
8856 int decimal = Py_UNICODE_TODECIMAL(ch);
8857 if (decimal >= 0)
8858 ch = '0' + decimal;
8859 }
8860 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008861 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01008862 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00008863}
Guido van Rossum9e896b32000-04-05 20:11:21 +00008864/* --- Decimal Encoder ---------------------------------------------------- */
8865
Alexander Belopolsky40018472011-02-26 01:02:56 +00008866int
8867PyUnicode_EncodeDecimal(Py_UNICODE *s,
8868 Py_ssize_t length,
8869 char *output,
8870 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00008871{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008872 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01008873 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01008874 enum PyUnicode_Kind kind;
8875 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008876
8877 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008878 PyErr_BadArgument();
8879 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008880 }
8881
Victor Stinner42bf7752011-11-21 22:52:58 +01008882 unicode = PyUnicode_FromUnicode(s, length);
8883 if (unicode == NULL)
8884 return -1;
8885
Benjamin Petersonbac79492012-01-14 13:34:47 -05008886 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner6345be92011-11-25 20:09:01 +01008887 Py_DECREF(unicode);
8888 return -1;
8889 }
Victor Stinner42bf7752011-11-21 22:52:58 +01008890 kind = PyUnicode_KIND(unicode);
8891 data = PyUnicode_DATA(unicode);
8892
Victor Stinnerb84d7232011-11-22 01:50:07 +01008893 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01008894 PyObject *exc;
8895 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00008896 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01008897 Py_ssize_t startpos;
8898
8899 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00008900
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008902 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01008903 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008905 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008906 decimal = Py_UNICODE_TODECIMAL(ch);
8907 if (decimal >= 0) {
8908 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008909 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 continue;
8911 }
8912 if (0 < ch && ch < 256) {
8913 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01008914 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00008915 continue;
8916 }
Victor Stinner6345be92011-11-25 20:09:01 +01008917
Victor Stinner42bf7752011-11-21 22:52:58 +01008918 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01008919 exc = NULL;
8920 raise_encode_exception(&exc, "decimal", unicode,
8921 startpos, startpos+1,
8922 "invalid decimal Unicode string");
8923 Py_XDECREF(exc);
8924 Py_DECREF(unicode);
8925 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008926 }
8927 /* 0-terminate the output string */
8928 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01008929 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00008930 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00008931}
8932
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933/* --- Helpers ------------------------------------------------------------ */
8934
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`, and turns negative `start`/`end` into offsets from
   the end of the sequence (never below 0).  `start` and `end` must be
   modifiable lvalues; `len` may be evaluated more than once.

   The body is wrapped in do { } while (0) so the macro expands to a single
   statement and can be used safely in an unbraced if/else; invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
8949
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008950static Py_ssize_t
Victor Stinner794d5672011-10-10 03:21:36 +02008951any_find_slice(int direction, PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008952 Py_ssize_t start,
8953 Py_ssize_t end)
8954{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008955 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008956 void *buf1, *buf2;
8957 Py_ssize_t len1, len2, result;
8958
8959 kind1 = PyUnicode_KIND(s1);
8960 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008961 if (kind1 < kind2)
8962 return -1;
8963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 len1 = PyUnicode_GET_LENGTH(s1);
8965 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008966 ADJUST_INDICES(start, end, len1);
8967 if (end - start < len2)
8968 return -1;
8969
8970 buf1 = PyUnicode_DATA(s1);
8971 buf2 = PyUnicode_DATA(s2);
8972 if (len2 == 1) {
8973 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
8974 result = findchar((const char *)buf1 + kind1*start,
8975 kind1, end - start, ch, direction);
8976 if (result == -1)
8977 return -1;
8978 else
8979 return start + result;
8980 }
8981
8982 if (kind2 != kind1) {
8983 buf2 = _PyUnicode_AsKind(s2, kind1);
8984 if (!buf2)
8985 return -2;
8986 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987
Victor Stinner794d5672011-10-10 03:21:36 +02008988 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02008989 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02008990 case PyUnicode_1BYTE_KIND:
8991 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8992 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8993 else
8994 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8995 break;
8996 case PyUnicode_2BYTE_KIND:
8997 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8998 break;
8999 case PyUnicode_4BYTE_KIND:
9000 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9001 break;
9002 default:
9003 assert(0); result = -2;
9004 }
9005 }
9006 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009007 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009008 case PyUnicode_1BYTE_KIND:
9009 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9010 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9011 else
9012 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9013 break;
9014 case PyUnicode_2BYTE_KIND:
9015 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9016 break;
9017 case PyUnicode_4BYTE_KIND:
9018 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9019 break;
9020 default:
9021 assert(0); result = -2;
9022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009023 }
9024
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009025 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009026 PyMem_Free(buf2);
9027
9028 return result;
9029}
9030
9031Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009032_PyUnicode_InsertThousandsGrouping(
9033 PyObject *unicode, Py_ssize_t index,
9034 Py_ssize_t n_buffer,
9035 void *digits, Py_ssize_t n_digits,
9036 Py_ssize_t min_width,
9037 const char *grouping, PyObject *thousands_sep,
9038 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039{
Victor Stinner41a863c2012-02-24 00:37:51 +01009040 unsigned int kind, thousands_sep_kind;
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009041 char *data, *thousands_sep_data;
Victor Stinner41a863c2012-02-24 00:37:51 +01009042 Py_ssize_t thousands_sep_len;
9043 Py_ssize_t len;
9044
9045 if (unicode != NULL) {
9046 kind = PyUnicode_KIND(unicode);
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009047 data = (char *) PyUnicode_DATA(unicode) + index * kind;
Victor Stinner41a863c2012-02-24 00:37:51 +01009048 }
9049 else {
9050 kind = PyUnicode_1BYTE_KIND;
9051 data = NULL;
9052 }
9053 thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9054 thousands_sep_data = PyUnicode_DATA(thousands_sep);
9055 thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9056 if (unicode != NULL && thousands_sep_kind != kind) {
Victor Stinner90f50d42012-02-24 01:44:47 +01009057 if (thousands_sep_kind < kind) {
9058 thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9059 if (!thousands_sep_data)
9060 return -1;
9061 }
9062 else {
9063 data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9064 if (!data)
9065 return -1;
9066 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009067 }
9068
Benjamin Petersonead6b532011-12-20 17:23:42 -06009069 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009070 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009071 if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
Victor Stinner41a863c2012-02-24 00:37:51 +01009072 len = asciilib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009073 (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009074 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009075 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinnerc3cec782011-10-05 21:24:08 +02009076 else
Victor Stinner41a863c2012-02-24 00:37:51 +01009077 len = ucs1lib_InsertThousandsGrouping(
Victor Stinnerc3cec782011-10-05 21:24:08 +02009078 (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009079 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009080 (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009081 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082 case PyUnicode_2BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009083 len = ucs2lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009084 (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009085 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009086 (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009087 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 case PyUnicode_4BYTE_KIND:
Victor Stinner41a863c2012-02-24 00:37:51 +01009089 len = ucs4lib_InsertThousandsGrouping(
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009090 (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009091 min_width, grouping,
Antoine Pitrou842c0f12012-02-24 13:30:46 +01009092 (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
Victor Stinner41a863c2012-02-24 00:37:51 +01009093 break;
9094 default:
9095 assert(0);
9096 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009097 }
Victor Stinner90f50d42012-02-24 01:44:47 +01009098 if (unicode != NULL && thousands_sep_kind != kind) {
9099 if (thousands_sep_kind < kind)
9100 PyMem_Free(thousands_sep_data);
9101 else
9102 PyMem_Free(data);
9103 }
Victor Stinner41a863c2012-02-24 00:37:51 +01009104 if (unicode == NULL) {
9105 *maxchar = 127;
9106 if (len != n_digits) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009107 *maxchar = Py_MAX(*maxchar,
Victor Stinnere6abb482012-05-02 01:15:40 +02009108 PyUnicode_MAX_CHAR_VALUE(thousands_sep));
Victor Stinner41a863c2012-02-24 00:37:51 +01009109 }
9110 }
9111 return len;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112}
9113
9114
Alexander Belopolsky40018472011-02-26 01:02:56 +00009115Py_ssize_t
9116PyUnicode_Count(PyObject *str,
9117 PyObject *substr,
9118 Py_ssize_t start,
9119 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009120{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009121 Py_ssize_t result;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009122 PyObject* str_obj;
9123 PyObject* sub_obj;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009124 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009125 void *buf1 = NULL, *buf2 = NULL;
9126 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009127
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009128 str_obj = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009129 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 return -1;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009131 sub_obj = PyUnicode_FromObject(substr);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009132 if (!sub_obj) {
9133 Py_DECREF(str_obj);
9134 return -1;
9135 }
Benjamin Peterson4c13a4a2012-01-02 09:07:38 -06009136 if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
Benjamin Peterson5e458f52012-01-02 10:12:13 -06009137 Py_DECREF(sub_obj);
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 Py_DECREF(str_obj);
9139 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009140 }
Tim Petersced69f82003-09-16 20:30:58 +00009141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009142 kind1 = PyUnicode_KIND(str_obj);
9143 kind2 = PyUnicode_KIND(sub_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009144 if (kind1 < kind2) {
9145 Py_DECREF(sub_obj);
9146 Py_DECREF(str_obj);
9147 return 0;
Benjamin Peterson1ff2e352012-05-11 17:41:20 -05009148 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009149
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009150 len1 = PyUnicode_GET_LENGTH(str_obj);
9151 len2 = PyUnicode_GET_LENGTH(sub_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009153 if (end - start < len2) {
9154 Py_DECREF(sub_obj);
9155 Py_DECREF(str_obj);
9156 return 0;
9157 }
9158
9159 buf1 = PyUnicode_DATA(str_obj);
9160 buf2 = PyUnicode_DATA(sub_obj);
9161 if (kind2 != kind1) {
9162 buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9163 if (!buf2)
9164 goto onError;
9165 }
9166
9167 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009169 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9170 result = asciilib_count(
9171 ((Py_UCS1*)buf1) + start, end - start,
9172 buf2, len2, PY_SSIZE_T_MAX
9173 );
9174 else
9175 result = ucs1lib_count(
9176 ((Py_UCS1*)buf1) + start, end - start,
9177 buf2, len2, PY_SSIZE_T_MAX
9178 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 break;
9180 case PyUnicode_2BYTE_KIND:
9181 result = ucs2lib_count(
9182 ((Py_UCS2*)buf1) + start, end - start,
9183 buf2, len2, PY_SSIZE_T_MAX
9184 );
9185 break;
9186 case PyUnicode_4BYTE_KIND:
9187 result = ucs4lib_count(
9188 ((Py_UCS4*)buf1) + start, end - start,
9189 buf2, len2, PY_SSIZE_T_MAX
9190 );
9191 break;
9192 default:
9193 assert(0); result = 0;
9194 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009195
9196 Py_DECREF(sub_obj);
9197 Py_DECREF(str_obj);
9198
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009199 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009200 PyMem_Free(buf2);
9201
Guido van Rossumd57fd912000-03-10 22:53:23 +00009202 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009203 onError:
9204 Py_DECREF(sub_obj);
9205 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009206 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009207 PyMem_Free(buf2);
9208 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009209}
9210
Alexander Belopolsky40018472011-02-26 01:02:56 +00009211Py_ssize_t
9212PyUnicode_Find(PyObject *str,
9213 PyObject *sub,
9214 Py_ssize_t start,
9215 Py_ssize_t end,
9216 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009217{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009218 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009219
Guido van Rossumd57fd912000-03-10 22:53:23 +00009220 str = PyUnicode_FromObject(str);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009221 if (!str)
Benjamin Peterson29060642009-01-31 22:14:21 +00009222 return -2;
Thomas Wouters477c8d52006-05-27 19:21:47 +00009223 sub = PyUnicode_FromObject(sub);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009224 if (!sub) {
9225 Py_DECREF(str);
9226 return -2;
9227 }
9228 if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9229 Py_DECREF(sub);
Benjamin Peterson29060642009-01-31 22:14:21 +00009230 Py_DECREF(str);
9231 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009232 }
Tim Petersced69f82003-09-16 20:30:58 +00009233
Victor Stinner794d5672011-10-10 03:21:36 +02009234 result = any_find_slice(direction,
9235 str, sub, start, end
9236 );
Thomas Wouters477c8d52006-05-27 19:21:47 +00009237
Guido van Rossumd57fd912000-03-10 22:53:23 +00009238 Py_DECREF(str);
Thomas Wouters477c8d52006-05-27 19:21:47 +00009239 Py_DECREF(sub);
9240
Guido van Rossumd57fd912000-03-10 22:53:23 +00009241 return result;
9242}
9243
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009244Py_ssize_t
9245PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9246 Py_ssize_t start, Py_ssize_t end,
9247 int direction)
9248{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009249 int kind;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009250 Py_ssize_t result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009251 if (PyUnicode_READY(str) == -1)
9252 return -2;
Victor Stinner267aa242011-10-02 01:08:37 +02009253 if (start < 0 || end < 0) {
9254 PyErr_SetString(PyExc_IndexError, "string index out of range");
9255 return -2;
9256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009257 if (end > PyUnicode_GET_LENGTH(str))
9258 end = PyUnicode_GET_LENGTH(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009259 if (start >= end)
9260 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009261 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009262 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9263 kind, end-start, ch, direction);
9264 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009266 else
9267 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268}
9269
Alexander Belopolsky40018472011-02-26 01:02:56 +00009270static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009271tailmatch(PyObject *self,
9272 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009273 Py_ssize_t start,
9274 Py_ssize_t end,
9275 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009276{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 int kind_self;
9278 int kind_sub;
9279 void *data_self;
9280 void *data_sub;
9281 Py_ssize_t offset;
9282 Py_ssize_t i;
9283 Py_ssize_t end_sub;
9284
9285 if (PyUnicode_READY(self) == -1 ||
9286 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009287 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009289 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9290 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009291 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009292 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009293
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009294 if (PyUnicode_GET_LENGTH(substring) == 0)
9295 return 1;
9296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009297 kind_self = PyUnicode_KIND(self);
9298 data_self = PyUnicode_DATA(self);
9299 kind_sub = PyUnicode_KIND(substring);
9300 data_sub = PyUnicode_DATA(substring);
9301 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9302
9303 if (direction > 0)
9304 offset = end;
9305 else
9306 offset = start;
9307
9308 if (PyUnicode_READ(kind_self, data_self, offset) ==
9309 PyUnicode_READ(kind_sub, data_sub, 0) &&
9310 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9311 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9312 /* If both are of the same kind, memcmp is sufficient */
9313 if (kind_self == kind_sub) {
9314 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009315 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316 data_sub,
9317 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009318 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319 }
9320 /* otherwise we have to compare each character by first accesing it */
9321 else {
9322 /* We do not need to compare 0 and len(substring)-1 because
9323 the if statement above ensured already that they are equal
9324 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009325 for (i = 1; i < end_sub; ++i) {
9326 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9327 PyUnicode_READ(kind_sub, data_sub, i))
9328 return 0;
9329 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009330 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009332 }
9333
9334 return 0;
9335}
9336
Alexander Belopolsky40018472011-02-26 01:02:56 +00009337Py_ssize_t
9338PyUnicode_Tailmatch(PyObject *str,
9339 PyObject *substr,
9340 Py_ssize_t start,
9341 Py_ssize_t end,
9342 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009343{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009344 Py_ssize_t result;
Tim Petersced69f82003-09-16 20:30:58 +00009345
Guido van Rossumd57fd912000-03-10 22:53:23 +00009346 str = PyUnicode_FromObject(str);
9347 if (str == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009348 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009349 substr = PyUnicode_FromObject(substr);
9350 if (substr == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009351 Py_DECREF(str);
9352 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009353 }
Tim Petersced69f82003-09-16 20:30:58 +00009354
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009355 result = tailmatch(str, substr,
Benjamin Peterson29060642009-01-31 22:14:21 +00009356 start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009357 Py_DECREF(str);
9358 Py_DECREF(substr);
9359 return result;
9360}
9361
Guido van Rossumd57fd912000-03-10 22:53:23 +00009362/* Apply fixfct filter to the Unicode object self and return a
9363 reference to the modified object */
9364
Alexander Belopolsky40018472011-02-26 01:02:56 +00009365static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +02009366fixup(PyObject *self,
9367 Py_UCS4 (*fixfct)(PyObject *s))
Guido van Rossumd57fd912000-03-10 22:53:23 +00009368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009369 PyObject *u;
9370 Py_UCS4 maxchar_old, maxchar_new = 0;
Victor Stinnereaab6042011-12-11 22:22:39 +01009371 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009372
Victor Stinnerbf6e5602011-12-12 01:53:47 +01009373 u = _PyUnicode_Copy(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009374 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009375 return NULL;
Victor Stinner87af4f22011-11-21 23:03:47 +01009376 maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00009377
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378 /* fix functions return the new maximum character in a string,
9379 if the kind of the resulting unicode object does not change,
9380 everything is fine. Otherwise we need to change the string kind
9381 and re-run the fix function. */
Victor Stinner9310abb2011-10-05 00:59:23 +02009382 maxchar_new = fixfct(u);
Victor Stinnereaab6042011-12-11 22:22:39 +01009383
9384 if (maxchar_new == 0) {
9385 /* no changes */;
9386 if (PyUnicode_CheckExact(self)) {
9387 Py_DECREF(u);
9388 Py_INCREF(self);
9389 return self;
9390 }
9391 else
9392 return u;
9393 }
9394
Victor Stinnere6abb482012-05-02 01:15:40 +02009395 maxchar_new = align_maxchar(maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009396
Victor Stinnereaab6042011-12-11 22:22:39 +01009397 if (maxchar_new == maxchar_old)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009398 return u;
Victor Stinnereaab6042011-12-11 22:22:39 +01009399
9400 /* In case the maximum character changed, we need to
9401 convert the string to the new category. */
9402 v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9403 if (v == NULL) {
9404 Py_DECREF(u);
9405 return NULL;
9406 }
9407 if (maxchar_new > maxchar_old) {
9408 /* If the maxchar increased so that the kind changed, not all
9409 characters are representable anymore and we need to fix the
9410 string again. This only happens in very few cases. */
Victor Stinnerd3f08822012-05-29 12:57:52 +02009411 _PyUnicode_FastCopyCharacters(v, 0,
9412 self, 0, PyUnicode_GET_LENGTH(self));
Victor Stinnereaab6042011-12-11 22:22:39 +01009413 maxchar_old = fixfct(v);
9414 assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009415 }
9416 else {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009417 _PyUnicode_FastCopyCharacters(v, 0,
9418 u, 0, PyUnicode_GET_LENGTH(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009419 }
Victor Stinnereaab6042011-12-11 22:22:39 +01009420 Py_DECREF(u);
9421 assert(_PyUnicode_CheckConsistency(v, 1));
9422 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009423}
9424
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009425static PyObject *
9426ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009427{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009428 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9429 char *resdata, *data = PyUnicode_DATA(self);
9430 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009431
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009432 res = PyUnicode_New(len, 127);
9433 if (res == NULL)
9434 return NULL;
9435 resdata = PyUnicode_DATA(res);
9436 if (lower)
9437 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009438 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009439 _Py_bytes_upper(resdata, data, len);
9440 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441}
9442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009443static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009444handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009446 Py_ssize_t j;
9447 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009448 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009449 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009450
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009451 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9452
9453 where ! is a negation and \p{xxx} is a character with property xxx.
9454 */
9455 for (j = i - 1; j >= 0; j--) {
9456 c = PyUnicode_READ(kind, data, j);
9457 if (!_PyUnicode_IsCaseIgnorable(c))
9458 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009459 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009460 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9461 if (final_sigma) {
9462 for (j = i + 1; j < length; j++) {
9463 c = PyUnicode_READ(kind, data, j);
9464 if (!_PyUnicode_IsCaseIgnorable(c))
9465 break;
9466 }
9467 final_sigma = j == length || !_PyUnicode_IsCased(c);
9468 }
9469 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009470}
9471
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009472static int
9473lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9474 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009475{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009476 /* Obscure special case. */
9477 if (c == 0x3A3) {
9478 mapped[0] = handle_capital_sigma(kind, data, length, i);
9479 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009480 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009481 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009482}
9483
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009484static Py_ssize_t
9485do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009486{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009487 Py_ssize_t i, k = 0;
9488 int n_res, j;
9489 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009490
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009491 c = PyUnicode_READ(kind, data, 0);
9492 n_res = _PyUnicode_ToUpperFull(c, mapped);
9493 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009494 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009495 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009496 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009497 for (i = 1; i < length; i++) {
9498 c = PyUnicode_READ(kind, data, i);
9499 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9500 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009501 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009502 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009503 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009504 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009505 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009506}
9507
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009508static Py_ssize_t
9509do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9510 Py_ssize_t i, k = 0;
9511
9512 for (i = 0; i < length; i++) {
9513 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9514 int n_res, j;
9515 if (Py_UNICODE_ISUPPER(c)) {
9516 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9517 }
9518 else if (Py_UNICODE_ISLOWER(c)) {
9519 n_res = _PyUnicode_ToUpperFull(c, mapped);
9520 }
9521 else {
9522 n_res = 1;
9523 mapped[0] = c;
9524 }
9525 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009526 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009527 res[k++] = mapped[j];
9528 }
9529 }
9530 return k;
9531}
9532
9533static Py_ssize_t
9534do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9535 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009536{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009537 Py_ssize_t i, k = 0;
9538
9539 for (i = 0; i < length; i++) {
9540 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9541 int n_res, j;
9542 if (lower)
9543 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9544 else
9545 n_res = _PyUnicode_ToUpperFull(c, mapped);
9546 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009547 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009548 res[k++] = mapped[j];
9549 }
9550 }
9551 return k;
9552}
9553
9554static Py_ssize_t
9555do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9556{
9557 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9558}
9559
9560static Py_ssize_t
9561do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9562{
9563 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9564}
9565
Benjamin Petersone51757f2012-01-12 21:10:29 -05009566static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009567do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9568{
9569 Py_ssize_t i, k = 0;
9570
9571 for (i = 0; i < length; i++) {
9572 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9573 Py_UCS4 mapped[3];
9574 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9575 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009576 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009577 res[k++] = mapped[j];
9578 }
9579 }
9580 return k;
9581}
9582
9583static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009584do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9585{
9586 Py_ssize_t i, k = 0;
9587 int previous_is_cased;
9588
9589 previous_is_cased = 0;
9590 for (i = 0; i < length; i++) {
9591 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9592 Py_UCS4 mapped[3];
9593 int n_res, j;
9594
9595 if (previous_is_cased)
9596 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9597 else
9598 n_res = _PyUnicode_ToTitleFull(c, mapped);
9599
9600 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009601 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009602 res[k++] = mapped[j];
9603 }
9604
9605 previous_is_cased = _PyUnicode_IsCased(c);
9606 }
9607 return k;
9608}
9609
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009610static PyObject *
9611case_operation(PyObject *self,
9612 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9613{
9614 PyObject *res = NULL;
9615 Py_ssize_t length, newlength = 0;
9616 int kind, outkind;
9617 void *data, *outdata;
9618 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9619
Benjamin Petersoneea48462012-01-16 14:28:50 -05009620 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009621
9622 kind = PyUnicode_KIND(self);
9623 data = PyUnicode_DATA(self);
9624 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009625 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009626 PyErr_SetString(PyExc_OverflowError, "string is too long");
9627 return NULL;
9628 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009629 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009630 if (tmp == NULL)
9631 return PyErr_NoMemory();
9632 newlength = perform(kind, data, length, tmp, &maxchar);
9633 res = PyUnicode_New(newlength, maxchar);
9634 if (res == NULL)
9635 goto leave;
9636 tmpend = tmp + newlength;
9637 outdata = PyUnicode_DATA(res);
9638 outkind = PyUnicode_KIND(res);
9639 switch (outkind) {
9640 case PyUnicode_1BYTE_KIND:
9641 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9642 break;
9643 case PyUnicode_2BYTE_KIND:
9644 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9645 break;
9646 case PyUnicode_4BYTE_KIND:
9647 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9648 break;
9649 default:
9650 assert(0);
9651 break;
9652 }
9653 leave:
9654 PyMem_FREE(tmp);
9655 return res;
9656}
9657
Tim Peters8ce9f162004-08-27 01:49:32 +00009658PyObject *
9659PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 PyObject *sep = NULL;
Victor Stinnerdd077322011-10-07 17:02:31 +02009662 Py_ssize_t seplen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009663 PyObject *res = NULL; /* the result */
Tim Peters05eba1f2004-08-27 21:32:02 +00009664 PyObject *fseq; /* PySequence_Fast(seq) */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009665 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
9666 PyObject **items;
Tim Peters8ce9f162004-08-27 01:49:32 +00009667 PyObject *item;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 Py_ssize_t sz, i, res_offset;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009669 Py_UCS4 maxchar;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009670 Py_UCS4 item_maxchar;
Victor Stinnerdd077322011-10-07 17:02:31 +02009671 int use_memcpy;
9672 unsigned char *res_data = NULL, *sep_data = NULL;
9673 PyObject *last_obj;
9674 unsigned int kind = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009676 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009677 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009678 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009679 }
9680
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009681 /* NOTE: the following code can't call back into Python code,
9682 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009683 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009684
Tim Peters05eba1f2004-08-27 21:32:02 +00009685 seqlen = PySequence_Fast_GET_SIZE(fseq);
9686 /* If empty sequence, return u"". */
9687 if (seqlen == 0) {
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009688 Py_DECREF(fseq);
Serhiy Storchaka678db842013-01-26 12:16:36 +02009689 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009690 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009691
Tim Peters05eba1f2004-08-27 21:32:02 +00009692 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009693 last_obj = NULL;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009694 items = PySequence_Fast_ITEMS(fseq);
Victor Stinneracf47b82011-10-06 12:32:37 +02009695 if (seqlen == 1) {
9696 if (PyUnicode_CheckExact(items[0])) {
9697 res = items[0];
9698 Py_INCREF(res);
9699 Py_DECREF(fseq);
9700 return res;
9701 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009702 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009703 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009704 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009705 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009706 /* Set up sep and seplen */
9707 if (separator == NULL) {
9708 /* fall back to a blank space separator */
9709 sep = PyUnicode_FromOrdinal(' ');
9710 if (!sep)
9711 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009712 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009713 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009714 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009715 else {
9716 if (!PyUnicode_Check(separator)) {
9717 PyErr_Format(PyExc_TypeError,
9718 "separator: expected str instance,"
9719 " %.80s found",
9720 Py_TYPE(separator)->tp_name);
9721 goto onError;
9722 }
9723 if (PyUnicode_READY(separator))
9724 goto onError;
9725 sep = separator;
9726 seplen = PyUnicode_GET_LENGTH(separator);
9727 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9728 /* inc refcount to keep this code path symmetric with the
9729 above case of a blank separator */
9730 Py_INCREF(sep);
9731 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009732 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009733 }
9734
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009735 /* There are at least two things to join, or else we have a subclass
9736 * of str in the sequence.
9737 * Do a pre-pass to figure out the total amount of space we'll
9738 * need (sz), and see whether all argument are strings.
9739 */
9740 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +02009741#ifdef Py_DEBUG
9742 use_memcpy = 0;
9743#else
9744 use_memcpy = 1;
9745#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009746 for (i = 0; i < seqlen; i++) {
9747 const Py_ssize_t old_sz = sz;
9748 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +00009749 if (!PyUnicode_Check(item)) {
9750 PyErr_Format(PyExc_TypeError,
Victor Stinnera33bce02014-07-04 22:47:46 +02009751 "sequence item %zd: expected str instance,"
Benjamin Peterson29060642009-01-31 22:14:21 +00009752 " %.80s found",
9753 i, Py_TYPE(item)->tp_name);
9754 goto onError;
9755 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009756 if (PyUnicode_READY(item) == -1)
9757 goto onError;
9758 sz += PyUnicode_GET_LENGTH(item);
9759 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009760 maxchar = Py_MAX(maxchar, item_maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009761 if (i != 0)
9762 sz += seplen;
9763 if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9764 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +00009765 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009766 goto onError;
9767 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009768 if (use_memcpy && last_obj != NULL) {
9769 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9770 use_memcpy = 0;
9771 }
9772 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009773 }
Tim Petersced69f82003-09-16 20:30:58 +00009774
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009776 if (res == NULL)
9777 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +00009778
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009779 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009780#ifdef Py_DEBUG
9781 use_memcpy = 0;
9782#else
9783 if (use_memcpy) {
9784 res_data = PyUnicode_1BYTE_DATA(res);
9785 kind = PyUnicode_KIND(res);
9786 if (seplen != 0)
9787 sep_data = PyUnicode_1BYTE_DATA(sep);
9788 }
9789#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +02009790 if (use_memcpy) {
9791 for (i = 0; i < seqlen; ++i) {
9792 Py_ssize_t itemlen;
9793 item = items[i];
9794
9795 /* Copy item, and maybe the separator. */
9796 if (i && seplen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009797 Py_MEMCPY(res_data,
9798 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009799 kind * seplen);
9800 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009801 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009802
9803 itemlen = PyUnicode_GET_LENGTH(item);
9804 if (itemlen != 0) {
Victor Stinnerdd077322011-10-07 17:02:31 +02009805 Py_MEMCPY(res_data,
9806 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009807 kind * itemlen);
9808 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +02009809 }
Victor Stinner4560f9c2013-04-14 18:56:46 +02009810 }
9811 assert(res_data == PyUnicode_1BYTE_DATA(res)
9812 + kind * PyUnicode_GET_LENGTH(res));
9813 }
9814 else {
9815 for (i = 0, res_offset = 0; i < seqlen; ++i) {
9816 Py_ssize_t itemlen;
9817 item = items[i];
9818
9819 /* Copy item, and maybe the separator. */
9820 if (i && seplen != 0) {
9821 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9822 res_offset += seplen;
9823 }
9824
9825 itemlen = PyUnicode_GET_LENGTH(item);
9826 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02009827 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +02009828 res_offset += itemlen;
9829 }
Victor Stinner9ce5a832011-10-03 23:36:02 +02009830 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009831 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +02009832 }
Tim Peters8ce9f162004-08-27 01:49:32 +00009833
Tim Peters05eba1f2004-08-27 21:32:02 +00009834 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009835 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009836 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009837 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009838
Benjamin Peterson29060642009-01-31 22:14:21 +00009839 onError:
Tim Peters05eba1f2004-08-27 21:32:02 +00009840 Py_DECREF(fseq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009841 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +00009842 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009843 return NULL;
9844}
9845
/* Store the code point `value` into `length` consecutive characters of the
   buffer `data`, beginning at character index `start`.

   `kind` selects the character width of the buffer: PyUnicode_1BYTE_KIND,
   PyUnicode_2BYTE_KIND or PyUnicode_4BYTE_KIND.  PyUnicode_WCHAR_KIND and
   any other value are rejected by an assertion only.

   Every macro argument is evaluated exactly once and captured in a local
   named fill_<something>_, so arguments may be arbitrary expressions
   (including ones with side effects) as long as they do not themselves use
   one of those local names, `i_` or `to_`.

   The caller is responsible for making sure that `value` fits into a
   character of the selected width (it is truncated otherwise) and that
   `length` is not negative. */
#define FILL(kind, data, value, start, length) \
    do { \
        const int fill_kind_ = (int)(kind); \
        void *fill_data_ = (void *)(data); \
        const Py_UCS4 fill_value_ = (Py_UCS4)(value); \
        const Py_ssize_t fill_start_ = (start); \
        const Py_ssize_t fill_length_ = (length); \
        Py_ssize_t i_ = 0; \
        assert(fill_kind_ != PyUnicode_WCHAR_KIND); \
        switch (fill_kind_) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char *to_ = (unsigned char *)fill_data_ + fill_start_; \
            memset(to_, (unsigned char)fill_value_, fill_length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *to_ = (Py_UCS2 *)fill_data_ + fill_start_; \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = (Py_UCS2)fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 *to_ = (Py_UCS4 *)fill_data_ + fill_start_; \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9869
Victor Stinnerd3f08822012-05-29 12:57:52 +02009870void
9871_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9872 Py_UCS4 fill_char)
9873{
9874 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9875 const void *data = PyUnicode_DATA(unicode);
9876 assert(PyUnicode_IS_READY(unicode));
9877 assert(unicode_modifiable(unicode));
9878 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9879 assert(start >= 0);
9880 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9881 FILL(kind, data, fill_char, start, length);
9882}
9883
Victor Stinner3fe55312012-01-04 00:33:50 +01009884Py_ssize_t
9885PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9886 Py_UCS4 fill_char)
9887{
9888 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +01009889
9890 if (!PyUnicode_Check(unicode)) {
9891 PyErr_BadInternalCall();
9892 return -1;
9893 }
9894 if (PyUnicode_READY(unicode) == -1)
9895 return -1;
9896 if (unicode_check_modifiable(unicode))
9897 return -1;
9898
Victor Stinnerd3f08822012-05-29 12:57:52 +02009899 if (start < 0) {
9900 PyErr_SetString(PyExc_IndexError, "string index out of range");
9901 return -1;
9902 }
Victor Stinner3fe55312012-01-04 00:33:50 +01009903 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9904 PyErr_SetString(PyExc_ValueError,
9905 "fill character is bigger than "
9906 "the string maximum character");
9907 return -1;
9908 }
9909
9910 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9911 length = Py_MIN(maxlen, length);
9912 if (length <= 0)
9913 return 0;
9914
Victor Stinnerd3f08822012-05-29 12:57:52 +02009915 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +01009916 return length;
9917}
9918
Victor Stinner9310abb2011-10-05 00:59:23 +02009919static PyObject *
9920pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009921 Py_ssize_t left,
9922 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009925 PyObject *u;
9926 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009927 int kind;
9928 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929
9930 if (left < 0)
9931 left = 0;
9932 if (right < 0)
9933 right = 0;
9934
Victor Stinnerc4b49542011-12-11 22:44:26 +01009935 if (left == 0 && right == 0)
9936 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9939 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00009940 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9941 return NULL;
9942 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -07009944 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +02009946 if (!u)
9947 return NULL;
9948
9949 kind = PyUnicode_KIND(u);
9950 data = PyUnicode_DATA(u);
9951 if (left)
9952 FILL(kind, data, fill, 0, left);
9953 if (right)
9954 FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +02009955 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02009956 assert(_PyUnicode_CheckConsistency(u, 1));
9957 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009958}
9959
Alexander Belopolsky40018472011-02-26 01:02:56 +00009960PyObject *
9961PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009962{
Guido van Rossumd57fd912000-03-10 22:53:23 +00009963 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009964
9965 string = PyUnicode_FromObject(string);
Benjamin Peterson22a29702012-01-02 09:00:30 -06009966 if (string == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009967 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -06009968 if (PyUnicode_READY(string) == -1) {
9969 Py_DECREF(string);
9970 return NULL;
9971 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009972
Benjamin Petersonead6b532011-12-20 17:23:42 -06009973 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +02009975 if (PyUnicode_IS_ASCII(string))
9976 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009977 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009978 PyUnicode_GET_LENGTH(string), keepends);
9979 else
9980 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009981 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +02009982 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009983 break;
9984 case PyUnicode_2BYTE_KIND:
9985 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009986 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009987 PyUnicode_GET_LENGTH(string), keepends);
9988 break;
9989 case PyUnicode_4BYTE_KIND:
9990 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +01009991 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 PyUnicode_GET_LENGTH(string), keepends);
9993 break;
9994 default:
9995 assert(0);
9996 list = 0;
9997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009998 Py_DECREF(string);
9999 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010000}
10001
Alexander Belopolsky40018472011-02-26 01:02:56 +000010002static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010003split(PyObject *self,
10004 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010005 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010006{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010007 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010008 void *buf1, *buf2;
10009 Py_ssize_t len1, len2;
10010 PyObject* out;
10011
Guido van Rossumd57fd912000-03-10 22:53:23 +000010012 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010013 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010014
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 if (PyUnicode_READY(self) == -1)
10016 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010018 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010019 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010021 if (PyUnicode_IS_ASCII(self))
10022 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010023 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010024 PyUnicode_GET_LENGTH(self), maxcount
10025 );
10026 else
10027 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010028 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010029 PyUnicode_GET_LENGTH(self), maxcount
10030 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 case PyUnicode_2BYTE_KIND:
10032 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010033 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 PyUnicode_GET_LENGTH(self), maxcount
10035 );
10036 case PyUnicode_4BYTE_KIND:
10037 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010038 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 PyUnicode_GET_LENGTH(self), maxcount
10040 );
10041 default:
10042 assert(0);
10043 return NULL;
10044 }
10045
10046 if (PyUnicode_READY(substring) == -1)
10047 return NULL;
10048
10049 kind1 = PyUnicode_KIND(self);
10050 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010051 len1 = PyUnicode_GET_LENGTH(self);
10052 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010053 if (kind1 < kind2 || len1 < len2) {
10054 out = PyList_New(1);
10055 if (out == NULL)
10056 return NULL;
10057 Py_INCREF(self);
10058 PyList_SET_ITEM(out, 0, self);
10059 return out;
10060 }
10061 buf1 = PyUnicode_DATA(self);
10062 buf2 = PyUnicode_DATA(substring);
10063 if (kind2 != kind1) {
10064 buf2 = _PyUnicode_AsKind(substring, kind1);
10065 if (!buf2)
10066 return NULL;
10067 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010068
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010069 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010070 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010071 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10072 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010073 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010074 else
10075 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010076 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010077 break;
10078 case PyUnicode_2BYTE_KIND:
10079 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010080 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010081 break;
10082 case PyUnicode_4BYTE_KIND:
10083 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010084 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 break;
10086 default:
10087 out = NULL;
10088 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010089 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010090 PyMem_Free(buf2);
10091 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010092}
10093
Alexander Belopolsky40018472011-02-26 01:02:56 +000010094static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010095rsplit(PyObject *self,
10096 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010097 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010098{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010099 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010100 void *buf1, *buf2;
10101 Py_ssize_t len1, len2;
10102 PyObject* out;
10103
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010104 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010105 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010106
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 if (PyUnicode_READY(self) == -1)
10108 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010111 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010112 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010113 if (PyUnicode_IS_ASCII(self))
10114 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010115 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010116 PyUnicode_GET_LENGTH(self), maxcount
10117 );
10118 else
10119 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010120 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010121 PyUnicode_GET_LENGTH(self), maxcount
10122 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010123 case PyUnicode_2BYTE_KIND:
10124 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010125 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010126 PyUnicode_GET_LENGTH(self), maxcount
10127 );
10128 case PyUnicode_4BYTE_KIND:
10129 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010130 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010131 PyUnicode_GET_LENGTH(self), maxcount
10132 );
10133 default:
10134 assert(0);
10135 return NULL;
10136 }
10137
10138 if (PyUnicode_READY(substring) == -1)
10139 return NULL;
10140
10141 kind1 = PyUnicode_KIND(self);
10142 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 len1 = PyUnicode_GET_LENGTH(self);
10144 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010145 if (kind1 < kind2 || len1 < len2) {
10146 out = PyList_New(1);
10147 if (out == NULL)
10148 return NULL;
10149 Py_INCREF(self);
10150 PyList_SET_ITEM(out, 0, self);
10151 return out;
10152 }
10153 buf1 = PyUnicode_DATA(self);
10154 buf2 = PyUnicode_DATA(substring);
10155 if (kind2 != kind1) {
10156 buf2 = _PyUnicode_AsKind(substring, kind1);
10157 if (!buf2)
10158 return NULL;
10159 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010161 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010163 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10164 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010165 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010166 else
10167 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010168 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010169 break;
10170 case PyUnicode_2BYTE_KIND:
10171 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010172 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 break;
10174 case PyUnicode_4BYTE_KIND:
10175 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010176 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010177 break;
10178 default:
10179 out = NULL;
10180 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010181 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 PyMem_Free(buf2);
10183 return out;
10184}
10185
10186static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010187anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10188 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010190 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010192 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10193 return asciilib_find(buf1, len1, buf2, len2, offset);
10194 else
10195 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 case PyUnicode_2BYTE_KIND:
10197 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10198 case PyUnicode_4BYTE_KIND:
10199 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10200 }
10201 assert(0);
10202 return -1;
10203}
10204
10205static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010206anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10207 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010208{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010209 switch (kind) {
10210 case PyUnicode_1BYTE_KIND:
10211 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10212 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10213 else
10214 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10215 case PyUnicode_2BYTE_KIND:
10216 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10217 case PyUnicode_4BYTE_KIND:
10218 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10219 }
10220 assert(0);
10221 return 0;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010222}
10223
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010224static void
10225replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10226 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10227{
10228 int kind = PyUnicode_KIND(u);
10229 void *data = PyUnicode_DATA(u);
10230 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10231 if (kind == PyUnicode_1BYTE_KIND) {
10232 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10233 (Py_UCS1 *)data + len,
10234 u1, u2, maxcount);
10235 }
10236 else if (kind == PyUnicode_2BYTE_KIND) {
10237 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10238 (Py_UCS2 *)data + len,
10239 u1, u2, maxcount);
10240 }
10241 else {
10242 assert(kind == PyUnicode_4BYTE_KIND);
10243 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10244 (Py_UCS4 *)data + len,
10245 u1, u2, maxcount);
10246 }
10247}
10248
Alexander Belopolsky40018472011-02-26 01:02:56 +000010249static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250replace(PyObject *self, PyObject *str1,
10251 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010252{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 PyObject *u;
10254 char *sbuf = PyUnicode_DATA(self);
10255 char *buf1 = PyUnicode_DATA(str1);
10256 char *buf2 = PyUnicode_DATA(str2);
10257 int srelease = 0, release1 = 0, release2 = 0;
10258 int skind = PyUnicode_KIND(self);
10259 int kind1 = PyUnicode_KIND(str1);
10260 int kind2 = PyUnicode_KIND(str2);
10261 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10262 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10263 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010264 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010265 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266
10267 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010268 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010270 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010271
Victor Stinner59de0ee2011-10-07 10:01:28 +020010272 if (str1 == str2)
10273 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010274
Victor Stinner49a0a212011-10-12 23:46:10 +020010275 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010276 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10277 if (maxchar < maxchar_str1)
10278 /* substring too wide to be present */
10279 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010280 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10281 /* Replacing str1 with str2 may cause a maxchar reduction in the
10282 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010283 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010284 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010287 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010289 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010290 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010291 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010292 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010293 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010294
Victor Stinner69ed0f42013-04-09 21:48:24 +020010295 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010296 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010297 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010298 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010299 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010300 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010301 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010302 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010303
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010304 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10305 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010306 }
10307 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308 int rkind = skind;
10309 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010310 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 if (kind1 < rkind) {
10313 /* widen substring */
10314 buf1 = _PyUnicode_AsKind(str1, rkind);
10315 if (!buf1) goto error;
10316 release1 = 1;
10317 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010318 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010319 if (i < 0)
10320 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 if (rkind > kind2) {
10322 /* widen replacement */
10323 buf2 = _PyUnicode_AsKind(str2, rkind);
10324 if (!buf2) goto error;
10325 release2 = 1;
10326 }
10327 else if (rkind < kind2) {
10328 /* widen self and buf1 */
10329 rkind = kind2;
10330 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010331 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010332 sbuf = _PyUnicode_AsKind(self, rkind);
10333 if (!sbuf) goto error;
10334 srelease = 1;
10335 buf1 = _PyUnicode_AsKind(str1, rkind);
10336 if (!buf1) goto error;
10337 release1 = 1;
10338 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010339 u = PyUnicode_New(slen, maxchar);
10340 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010342 assert(PyUnicode_KIND(u) == rkind);
10343 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010344
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010345 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010346 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010347 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010348 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010349 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010351
10352 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010353 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010354 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010355 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010356 if (i == -1)
10357 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010358 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010360 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010362 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010363 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010364 }
10365 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010367 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 int rkind = skind;
10369 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010372 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 buf1 = _PyUnicode_AsKind(str1, rkind);
10374 if (!buf1) goto error;
10375 release1 = 1;
10376 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010377 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010378 if (n == 0)
10379 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010380 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010381 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 buf2 = _PyUnicode_AsKind(str2, rkind);
10383 if (!buf2) goto error;
10384 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010387 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388 rkind = kind2;
10389 sbuf = _PyUnicode_AsKind(self, rkind);
10390 if (!sbuf) goto error;
10391 srelease = 1;
10392 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010393 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 buf1 = _PyUnicode_AsKind(str1, rkind);
10395 if (!buf1) goto error;
10396 release1 = 1;
10397 }
10398 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10399 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010400 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 PyErr_SetString(PyExc_OverflowError,
10402 "replace string is too long");
10403 goto error;
10404 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010405 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010406 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010407 _Py_INCREF_UNICODE_EMPTY();
10408 if (!unicode_empty)
10409 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010410 u = unicode_empty;
10411 goto done;
10412 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010413 if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 PyErr_SetString(PyExc_OverflowError,
10415 "replace string is too long");
10416 goto error;
10417 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010418 u = PyUnicode_New(new_size, maxchar);
10419 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010421 assert(PyUnicode_KIND(u) == rkind);
10422 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 ires = i = 0;
10424 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010425 while (n-- > 0) {
10426 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010427 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010428 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010429 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010430 if (j == -1)
10431 break;
10432 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010433 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010434 memcpy(res + rkind * ires,
10435 sbuf + rkind * i,
10436 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010438 }
10439 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010440 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010441 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010443 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010445 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010447 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010448 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010449 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010450 memcpy(res + rkind * ires,
10451 sbuf + rkind * i,
10452 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010453 }
10454 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010455 /* interleave */
10456 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010457 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010458 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010459 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010461 if (--n <= 0)
10462 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010463 memcpy(res + rkind * ires,
10464 sbuf + rkind * i,
10465 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010466 ires++;
10467 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010468 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010469 memcpy(res + rkind * ires,
10470 sbuf + rkind * i,
10471 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010472 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010473 }
10474
10475 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010476 unicode_adjust_maxchar(&u);
10477 if (u == NULL)
10478 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010479 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010480
10481 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482 if (srelease)
10483 PyMem_FREE(sbuf);
10484 if (release1)
10485 PyMem_FREE(buf1);
10486 if (release2)
10487 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010488 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010489 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010490
Benjamin Peterson29060642009-01-31 22:14:21 +000010491 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010492 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 if (srelease)
10494 PyMem_FREE(sbuf);
10495 if (release1)
10496 PyMem_FREE(buf1);
10497 if (release2)
10498 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010499 return unicode_result_unchanged(self);
10500
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 error:
10502 if (srelease && sbuf)
10503 PyMem_FREE(sbuf);
10504 if (release1 && buf1)
10505 PyMem_FREE(buf1);
10506 if (release2 && buf2)
10507 PyMem_FREE(buf2);
10508 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509}
10510
10511/* --- Unicode Object Methods --------------------------------------------- */
10512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010513PyDoc_STRVAR(title__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010514 "S.title() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010515\n\
10516Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010517characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518
10519static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010520unicode_title(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010521{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010522 if (PyUnicode_READY(self) == -1)
10523 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010524 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010525}
10526
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010527PyDoc_STRVAR(capitalize__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010528 "S.capitalize() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010529\n\
10530Return a capitalized version of S, i.e. make the first character\n\
Senthil Kumarane51ee8a2010-07-05 12:00:56 +000010531have upper case and the rest lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532
10533static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020010534unicode_capitalize(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010535{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010536 if (PyUnicode_READY(self) == -1)
10537 return NULL;
10538 if (PyUnicode_GET_LENGTH(self) == 0)
10539 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010540 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541}
10542
Benjamin Petersond5890c82012-01-14 13:23:30 -050010543PyDoc_STRVAR(casefold__doc__,
10544 "S.casefold() -> str\n\
10545\n\
10546Return a version of S suitable for caseless comparisons.");
10547
10548static PyObject *
10549unicode_casefold(PyObject *self)
10550{
10551 if (PyUnicode_READY(self) == -1)
10552 return NULL;
10553 if (PyUnicode_IS_ASCII(self))
10554 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010555 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010556}
10557
10558
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010559/* Argument converter. Coerces to a single unicode character */
10560
10561static int
10562convert_uc(PyObject *obj, void *addr)
10563{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010565 PyObject *uniobj;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010566
Benjamin Peterson14339b62009-01-31 16:36:08 +000010567 uniobj = PyUnicode_FromObject(obj);
10568 if (uniobj == NULL) {
10569 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010570 "The fill character cannot be converted to Unicode");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010571 return 0;
10572 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 if (PyUnicode_GET_LENGTH(uniobj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010574 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010575 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010576 Py_DECREF(uniobj);
10577 return 0;
10578 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010579 *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010580 Py_DECREF(uniobj);
10581 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010582}
10583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000010584PyDoc_STRVAR(center__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000010585 "S.center(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000010587Return S centered in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010588done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589
10590static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010591unicode_center(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010593 Py_ssize_t marg, left;
10594 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 Py_UCS4 fillchar = ' ';
10596
Victor Stinnere9a29352011-10-01 02:14:59 +020010597 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599
Benjamin Petersonbac79492012-01-14 13:34:47 -050010600 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601 return NULL;
10602
Victor Stinnerc4b49542011-12-11 22:44:26 +010010603 if (PyUnicode_GET_LENGTH(self) >= width)
10604 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605
Victor Stinnerc4b49542011-12-11 22:44:26 +010010606 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607 left = marg / 2 + (marg & width & 1);
10608
Victor Stinner9310abb2011-10-05 00:59:23 +020010609 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610}
10611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612/* This function assumes that str1 and str2 are readied by the caller. */
10613
Marc-André Lemburge5034372000-08-08 08:04:29 +000010614static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010615unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010616{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010617#define COMPARE(TYPE1, TYPE2) \
10618 do { \
10619 TYPE1* p1 = (TYPE1 *)data1; \
10620 TYPE2* p2 = (TYPE2 *)data2; \
10621 TYPE1* end = p1 + len; \
10622 Py_UCS4 c1, c2; \
10623 for (; p1 != end; p1++, p2++) { \
10624 c1 = *p1; \
10625 c2 = *p2; \
10626 if (c1 != c2) \
10627 return (c1 < c2) ? -1 : 1; \
10628 } \
10629 } \
10630 while (0)
10631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010632 int kind1, kind2;
10633 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010634 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 kind1 = PyUnicode_KIND(str1);
10637 kind2 = PyUnicode_KIND(str2);
10638 data1 = PyUnicode_DATA(str1);
10639 data2 = PyUnicode_DATA(str2);
10640 len1 = PyUnicode_GET_LENGTH(str1);
10641 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010642 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010643
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010644 switch(kind1) {
10645 case PyUnicode_1BYTE_KIND:
10646 {
10647 switch(kind2) {
10648 case PyUnicode_1BYTE_KIND:
10649 {
10650 int cmp = memcmp(data1, data2, len);
10651 /* normalize result of memcmp() into the range [-1; 1] */
10652 if (cmp < 0)
10653 return -1;
10654 if (cmp > 0)
10655 return 1;
10656 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010657 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010658 case PyUnicode_2BYTE_KIND:
10659 COMPARE(Py_UCS1, Py_UCS2);
10660 break;
10661 case PyUnicode_4BYTE_KIND:
10662 COMPARE(Py_UCS1, Py_UCS4);
10663 break;
10664 default:
10665 assert(0);
10666 }
10667 break;
10668 }
10669 case PyUnicode_2BYTE_KIND:
10670 {
10671 switch(kind2) {
10672 case PyUnicode_1BYTE_KIND:
10673 COMPARE(Py_UCS2, Py_UCS1);
10674 break;
10675 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010676 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010677 COMPARE(Py_UCS2, Py_UCS2);
10678 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010679 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010680 case PyUnicode_4BYTE_KIND:
10681 COMPARE(Py_UCS2, Py_UCS4);
10682 break;
10683 default:
10684 assert(0);
10685 }
10686 break;
10687 }
10688 case PyUnicode_4BYTE_KIND:
10689 {
10690 switch(kind2) {
10691 case PyUnicode_1BYTE_KIND:
10692 COMPARE(Py_UCS4, Py_UCS1);
10693 break;
10694 case PyUnicode_2BYTE_KIND:
10695 COMPARE(Py_UCS4, Py_UCS2);
10696 break;
10697 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010698 {
10699#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10700 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10701 /* normalize result of wmemcmp() into the range [-1; 1] */
10702 if (cmp < 0)
10703 return -1;
10704 if (cmp > 0)
10705 return 1;
10706#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010707 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010708#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010709 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010710 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010711 default:
10712 assert(0);
10713 }
10714 break;
10715 }
10716 default:
10717 assert(0);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010718 }
10719
Victor Stinner770e19e2012-10-04 22:59:45 +020010720 if (len1 == len2)
10721 return 0;
10722 if (len1 < len2)
10723 return -1;
10724 else
10725 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010726
10727#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010728}
10729
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010730Py_LOCAL(int)
Victor Stinnere5567ad2012-10-23 02:48:49 +020010731unicode_compare_eq(PyObject *str1, PyObject *str2)
10732{
10733 int kind;
10734 void *data1, *data2;
10735 Py_ssize_t len;
10736 int cmp;
10737
Victor Stinnere5567ad2012-10-23 02:48:49 +020010738 len = PyUnicode_GET_LENGTH(str1);
10739 if (PyUnicode_GET_LENGTH(str2) != len)
10740 return 0;
10741 kind = PyUnicode_KIND(str1);
10742 if (PyUnicode_KIND(str2) != kind)
10743 return 0;
10744 data1 = PyUnicode_DATA(str1);
10745 data2 = PyUnicode_DATA(str2);
10746
10747 cmp = memcmp(data1, data2, len * kind);
10748 return (cmp == 0);
10749}
10750
10751
Alexander Belopolsky40018472011-02-26 01:02:56 +000010752int
10753PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010754{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10756 if (PyUnicode_READY(left) == -1 ||
10757 PyUnicode_READY(right) == -1)
10758 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010759
10760 /* a string is equal to itself */
10761 if (left == right)
10762 return 0;
10763
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010764 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000010766 PyErr_Format(PyExc_TypeError,
10767 "Can't compare %.100s and %.100s",
10768 left->ob_type->tp_name,
10769 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770 return -1;
10771}
10772
Martin v. Löwis5b222132007-06-10 09:51:05 +000010773int
Victor Stinnerad14ccd2013-11-07 00:46:04 +010010774_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10775{
10776 PyObject *right_str = _PyUnicode_FromId(right); /* borrowed */
10777 if (right_str == NULL)
10778 return -1;
10779 return PyUnicode_Compare(left, right_str);
10780}
10781
10782int
Martin v. Löwis5b222132007-06-10 09:51:05 +000010783PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10784{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 Py_ssize_t i;
10786 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010787 Py_UCS4 chr;
10788
Victor Stinner910337b2011-10-03 03:20:16 +020010789 assert(_PyUnicode_CHECK(uni));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 if (PyUnicode_READY(uni) == -1)
10791 return -1;
10792 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010793 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010010794 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010010795 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010010796 size_t len, len2 = strlen(str);
10797 int cmp;
10798
10799 len = Py_MIN(len1, len2);
10800 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010010801 if (cmp != 0) {
10802 if (cmp < 0)
10803 return -1;
10804 else
10805 return 1;
10806 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010010807 if (len1 > len2)
10808 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010809 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010010810 return -1; /* str is longer */
10811 return 0;
10812 }
10813 else {
10814 void *data = PyUnicode_DATA(uni);
10815 /* Compare Unicode string and source character set string */
10816 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020010817 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010010818 return (chr < (unsigned char)(str[i])) ? -1 : 1;
10819 /* This check keeps Python strings that end in '\0' from comparing equal
10820 to C strings identical up to that point. */
10821 if (PyUnicode_GET_LENGTH(uni) != i || chr)
10822 return 1; /* uni is longer */
10823 if (str[i])
10824 return -1; /* str is longer */
10825 return 0;
10826 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000010827}
10828
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010829
Benjamin Peterson29060642009-01-31 22:14:21 +000010830#define TEST_COND(cond) \
Benjamin Peterson14339b62009-01-31 16:36:08 +000010831 ((cond) ? Py_True : Py_False)
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010832
Alexander Belopolsky40018472011-02-26 01:02:56 +000010833PyObject *
10834PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010835{
10836 int result;
Victor Stinnere5567ad2012-10-23 02:48:49 +020010837 PyObject *v;
Benjamin Peterson14339b62009-01-31 16:36:08 +000010838
Victor Stinnere5567ad2012-10-23 02:48:49 +020010839 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10840 Py_RETURN_NOTIMPLEMENTED;
10841
10842 if (PyUnicode_READY(left) == -1 ||
10843 PyUnicode_READY(right) == -1)
10844 return NULL;
10845
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010010846 if (left == right) {
10847 switch (op) {
10848 case Py_EQ:
10849 case Py_LE:
10850 case Py_GE:
10851 /* a string is equal to itself */
10852 v = Py_True;
10853 break;
10854 case Py_NE:
10855 case Py_LT:
10856 case Py_GT:
10857 v = Py_False;
10858 break;
10859 default:
10860 PyErr_BadArgument();
10861 return NULL;
10862 }
10863 }
10864 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020010865 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010010866 result ^= (op == Py_NE);
10867 v = TEST_COND(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020010868 }
10869 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020010870 result = unicode_compare(left, right);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010871
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010872 /* Convert the return value to a Boolean */
10873 switch (op) {
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000010874 case Py_LE:
10875 v = TEST_COND(result <= 0);
10876 break;
10877 case Py_GE:
10878 v = TEST_COND(result >= 0);
10879 break;
10880 case Py_LT:
10881 v = TEST_COND(result == -1);
10882 break;
10883 case Py_GT:
10884 v = TEST_COND(result == 1);
10885 break;
10886 default:
10887 PyErr_BadArgument();
10888 return NULL;
10889 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010890 }
Victor Stinnere5567ad2012-10-23 02:48:49 +020010891 Py_INCREF(v);
10892 return v;
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000010893}
10894
Alexander Belopolsky40018472011-02-26 01:02:56 +000010895int
10896PyUnicode_Contains(PyObject *container, PyObject *element)
Guido van Rossum403d68b2000-03-13 15:55:09 +000010897{
Thomas Wouters477c8d52006-05-27 19:21:47 +000010898 PyObject *str, *sub;
Victor Stinner77282cb2013-04-14 19:22:47 +020010899 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010900 void *buf1, *buf2;
10901 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000010902 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010903
10904 /* Coerce the two arguments */
Thomas Wouters477c8d52006-05-27 19:21:47 +000010905 sub = PyUnicode_FromObject(element);
10906 if (!sub) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010907 PyErr_Format(PyExc_TypeError,
10908 "'in <string>' requires string as left operand, not %s",
10909 element->ob_type->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010910 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010911 }
10912
Thomas Wouters477c8d52006-05-27 19:21:47 +000010913 str = PyUnicode_FromObject(container);
Benjamin Peterson22a29702012-01-02 09:00:30 -060010914 if (!str) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010915 Py_DECREF(sub);
10916 return -1;
10917 }
10918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 kind1 = PyUnicode_KIND(str);
10920 kind2 = PyUnicode_KIND(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010921 if (kind1 < kind2) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922 Py_DECREF(sub);
Benjamin Peterson1ff2e352012-05-11 17:41:20 -050010923 Py_DECREF(str);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010924 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 }
10926 len1 = PyUnicode_GET_LENGTH(str);
10927 len2 = PyUnicode_GET_LENGTH(sub);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010928 if (len1 < len2) {
10929 Py_DECREF(sub);
10930 Py_DECREF(str);
10931 return 0;
10932 }
10933 buf1 = PyUnicode_DATA(str);
10934 buf2 = PyUnicode_DATA(sub);
10935 if (len2 == 1) {
10936 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
10937 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
10938 Py_DECREF(sub);
10939 Py_DECREF(str);
10940 return result;
10941 }
10942 if (kind2 != kind1) {
10943 buf2 = _PyUnicode_AsKind(sub, kind1);
10944 if (!buf2) {
10945 Py_DECREF(sub);
10946 Py_DECREF(str);
10947 return -1;
10948 }
10949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010950
Victor Stinner77282cb2013-04-14 19:22:47 +020010951 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 case PyUnicode_1BYTE_KIND:
10953 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10954 break;
10955 case PyUnicode_2BYTE_KIND:
10956 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10957 break;
10958 case PyUnicode_4BYTE_KIND:
10959 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10960 break;
10961 default:
10962 result = -1;
10963 assert(0);
10964 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000010965
10966 Py_DECREF(str);
10967 Py_DECREF(sub);
10968
Victor Stinner77282cb2013-04-14 19:22:47 +020010969 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 PyMem_Free(buf2);
10971
Guido van Rossum403d68b2000-03-13 15:55:09 +000010972 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000010973}
10974
Guido van Rossumd57fd912000-03-10 22:53:23 +000010975/* Concat to string or Unicode object giving a new Unicode object. */
10976
Alexander Belopolsky40018472011-02-26 01:02:56 +000010977PyObject *
10978PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 PyObject *u = NULL, *v = NULL, *w;
Victor Stinner127226b2011-10-13 01:12:34 +020010981 Py_UCS4 maxchar, maxchar2;
Victor Stinner488fa492011-12-12 00:01:39 +010010982 Py_ssize_t u_len, v_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010983
10984 /* Coerce the two arguments */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010985 u = PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010986 if (u == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010987 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 v = PyUnicode_FromObject(right);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010989 if (v == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000010990 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991
10992 /* Shortcuts */
Victor Stinnera464fc12011-10-02 20:39:30 +020010993 if (v == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010994 Py_DECREF(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010995 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010996 }
Victor Stinnera464fc12011-10-02 20:39:30 +020010997 if (u == unicode_empty) {
Benjamin Peterson29060642009-01-31 22:14:21 +000010998 Py_DECREF(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010999 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011000 }
11001
Victor Stinner488fa492011-12-12 00:01:39 +010011002 u_len = PyUnicode_GET_LENGTH(u);
11003 v_len = PyUnicode_GET_LENGTH(v);
11004 if (u_len > PY_SSIZE_T_MAX - v_len) {
11005 PyErr_SetString(PyExc_OverflowError,
11006 "strings are too large to concat");
11007 goto onError;
11008 }
11009 new_len = u_len + v_len;
11010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011011 maxchar = PyUnicode_MAX_CHAR_VALUE(u);
Victor Stinner127226b2011-10-13 01:12:34 +020011012 maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011013 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 /* Concat the two Unicode strings */
Victor Stinner488fa492011-12-12 00:01:39 +010011016 w = PyUnicode_New(new_len, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011017 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000011018 goto onError;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011019 _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11020 _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021 Py_DECREF(u);
11022 Py_DECREF(v);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011023 assert(_PyUnicode_CheckConsistency(w, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024 return w;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011025
Benjamin Peterson29060642009-01-31 22:14:21 +000011026 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011027 Py_XDECREF(u);
11028 Py_XDECREF(v);
11029 return NULL;
11030}
11031
Walter Dörwald1ab83302007-05-18 17:15:44 +000011032void
Victor Stinner23e56682011-10-03 03:54:37 +020011033PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011034{
Victor Stinner23e56682011-10-03 03:54:37 +020011035 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011036 Py_UCS4 maxchar, maxchar2;
11037 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011038
11039 if (p_left == NULL) {
11040 if (!PyErr_Occurred())
11041 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011042 return;
11043 }
Victor Stinner23e56682011-10-03 03:54:37 +020011044 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011045 if (right == NULL || left == NULL
11046 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011047 if (!PyErr_Occurred())
11048 PyErr_BadInternalCall();
11049 goto error;
11050 }
11051
Benjamin Petersonbac79492012-01-14 13:34:47 -050011052 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011053 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011054 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011055 goto error;
11056
Victor Stinner488fa492011-12-12 00:01:39 +010011057 /* Shortcuts */
11058 if (left == unicode_empty) {
11059 Py_DECREF(left);
11060 Py_INCREF(right);
11061 *p_left = right;
11062 return;
11063 }
11064 if (right == unicode_empty)
11065 return;
11066
11067 left_len = PyUnicode_GET_LENGTH(left);
11068 right_len = PyUnicode_GET_LENGTH(right);
11069 if (left_len > PY_SSIZE_T_MAX - right_len) {
11070 PyErr_SetString(PyExc_OverflowError,
11071 "strings are too large to concat");
11072 goto error;
11073 }
11074 new_len = left_len + right_len;
11075
11076 if (unicode_modifiable(left)
11077 && PyUnicode_CheckExact(right)
11078 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011079 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11080 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011081 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011082 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011083 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11084 {
11085 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011086 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011087 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011088
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011089 /* copy 'right' into the newly allocated area of 'left' */
11090 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011091 }
Victor Stinner488fa492011-12-12 00:01:39 +010011092 else {
11093 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11094 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011095 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011096
Victor Stinner488fa492011-12-12 00:01:39 +010011097 /* Concat the two Unicode strings */
11098 res = PyUnicode_New(new_len, maxchar);
11099 if (res == NULL)
11100 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011101 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11102 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011103 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011104 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011105 }
11106 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011107 return;
11108
11109error:
Victor Stinner488fa492011-12-12 00:01:39 +010011110 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011111}
11112
11113void
11114PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11115{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011116 PyUnicode_Append(pleft, right);
11117 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011118}
11119
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011120PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011121 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011123Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011124string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011125interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126
11127static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011128unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011129{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011130 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011131 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011132 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011133 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011134 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011135 void *buf1, *buf2;
11136 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011137
Jesus Ceaac451502011-04-20 17:09:23 +020011138 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11139 &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011140 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011142 kind1 = PyUnicode_KIND(self);
11143 kind2 = PyUnicode_KIND(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011144 if (kind1 < kind2) {
Christian Heimesd47802e2013-06-29 21:33:36 +020011145 Py_DECREF(substring);
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011146 return PyLong_FromLong(0);
Christian Heimesd47802e2013-06-29 21:33:36 +020011147 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011148 len1 = PyUnicode_GET_LENGTH(self);
11149 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011150 ADJUST_INDICES(start, end, len1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011151 if (end - start < len2) {
11152 Py_DECREF(substring);
11153 return PyLong_FromLong(0);
11154 }
11155 buf1 = PyUnicode_DATA(self);
11156 buf2 = PyUnicode_DATA(substring);
11157 if (kind2 != kind1) {
11158 buf2 = _PyUnicode_AsKind(substring, kind1);
11159 if (!buf2) {
11160 Py_DECREF(substring);
11161 return NULL;
11162 }
11163 }
11164 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 case PyUnicode_1BYTE_KIND:
11166 iresult = ucs1lib_count(
11167 ((Py_UCS1*)buf1) + start, end - start,
11168 buf2, len2, PY_SSIZE_T_MAX
11169 );
11170 break;
11171 case PyUnicode_2BYTE_KIND:
11172 iresult = ucs2lib_count(
11173 ((Py_UCS2*)buf1) + start, end - start,
11174 buf2, len2, PY_SSIZE_T_MAX
11175 );
11176 break;
11177 case PyUnicode_4BYTE_KIND:
11178 iresult = ucs4lib_count(
11179 ((Py_UCS4*)buf1) + start, end - start,
11180 buf2, len2, PY_SSIZE_T_MAX
11181 );
11182 break;
11183 default:
11184 assert(0); iresult = 0;
11185 }
11186
11187 result = PyLong_FromSsize_t(iresult);
11188
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011189 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191
11192 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011193
Guido van Rossumd57fd912000-03-10 22:53:23 +000011194 return result;
11195}
11196
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011197PyDoc_STRVAR(encode__doc__,
Victor Stinnerc911bbf2010-11-07 19:04:46 +000011198 "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011199\n\
Victor Stinnere14e2122010-11-07 18:41:46 +000011200Encode S using the codec registered for encoding. Default encoding\n\
11201is 'utf-8'. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +000011202handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +000011203a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11204'xmlcharrefreplace' as well as any other name registered with\n\
11205codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206
11207static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011208unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209{
Benjamin Peterson308d6372009-09-18 21:42:35 +000011210 static char *kwlist[] = {"encoding", "errors", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211 char *encoding = NULL;
11212 char *errors = NULL;
Guido van Rossum35d94282007-08-27 18:20:11 +000011213
Benjamin Peterson308d6372009-09-18 21:42:35 +000011214 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11215 kwlist, &encoding, &errors))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011216 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011217 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011218}
11219
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011220PyDoc_STRVAR(expandtabs__doc__,
Ezio Melotti745d54d2013-11-16 19:10:57 +020011221 "S.expandtabs(tabsize=8) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011222\n\
11223Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011224If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225
11226static PyObject*
Ezio Melotti745d54d2013-11-16 19:10:57 +020011227unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011228{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011229 Py_ssize_t i, j, line_pos, src_len, incr;
11230 Py_UCS4 ch;
11231 PyObject *u;
11232 void *src_data, *dest_data;
Ezio Melotti745d54d2013-11-16 19:10:57 +020011233 static char *kwlist[] = {"tabsize", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000011234 int tabsize = 8;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011235 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011236 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011237
Ezio Melotti745d54d2013-11-16 19:10:57 +020011238 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11239 kwlist, &tabsize))
Benjamin Peterson29060642009-01-31 22:14:21 +000011240 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011241
Antoine Pitrou22425222011-10-04 19:10:51 +020011242 if (PyUnicode_READY(self) == -1)
11243 return NULL;
11244
Thomas Wouters7e474022000-07-16 12:04:32 +000011245 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011246 src_len = PyUnicode_GET_LENGTH(self);
11247 i = j = line_pos = 0;
11248 kind = PyUnicode_KIND(self);
11249 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011250 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011251 for (; i < src_len; i++) {
11252 ch = PyUnicode_READ(kind, src_data, i);
11253 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011254 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011255 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011256 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011257 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011258 goto overflow;
11259 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011260 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011261 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011263 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011264 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011265 goto overflow;
11266 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011268 if (ch == '\n' || ch == '\r')
11269 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011270 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011271 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011272 if (!found)
11273 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011274
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011276 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277 if (!u)
11278 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011279 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280
Antoine Pitroue71d5742011-10-04 15:55:09 +020011281 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011282
Antoine Pitroue71d5742011-10-04 15:55:09 +020011283 for (; i < src_len; i++) {
11284 ch = PyUnicode_READ(kind, src_data, i);
11285 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011286 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011287 incr = tabsize - (line_pos % tabsize);
11288 line_pos += incr;
Victor Stinnerda79e632012-02-22 13:37:04 +010011289 FILL(kind, dest_data, ' ', j, incr);
11290 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011291 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011292 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011293 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011294 line_pos++;
11295 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011296 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011297 if (ch == '\n' || ch == '\r')
11298 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011299 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011300 }
11301 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011302 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011303
Antoine Pitroue71d5742011-10-04 15:55:09 +020011304 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011305 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307}
11308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011309PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011310 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011311\n\
11312Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011313such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011314arguments start and end are interpreted as in slice notation.\n\
11315\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011316Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317
11318static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011319unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011321 /* initialize variables to prevent gcc warning */
11322 PyObject *substring = NULL;
11323 Py_ssize_t start = 0;
11324 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011325 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011326
Jesus Ceaac451502011-04-20 17:09:23 +020011327 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11328 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011329 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011330
Christian Heimesd47802e2013-06-29 21:33:36 +020011331 if (PyUnicode_READY(self) == -1) {
11332 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011334 }
11335 if (PyUnicode_READY(substring) == -1) {
11336 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 return NULL;
Christian Heimesd47802e2013-06-29 21:33:36 +020011338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011339
Victor Stinner7931d9a2011-11-04 00:22:48 +010011340 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011341
11342 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011344 if (result == -2)
11345 return NULL;
11346
Christian Heimes217cfd12007-12-02 14:31:20 +000011347 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011348}
11349
11350static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011351unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011352{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011353 void *data;
11354 enum PyUnicode_Kind kind;
11355 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011356
11357 if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11358 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011360 }
11361 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11362 PyErr_SetString(PyExc_IndexError, "string index out of range");
11363 return NULL;
11364 }
11365 kind = PyUnicode_KIND(self);
11366 data = PyUnicode_DATA(self);
11367 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011368 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011369}
11370
Guido van Rossumc2504932007-09-18 19:42:40 +000011371/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011372 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011373static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011374unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011375{
Guido van Rossumc2504932007-09-18 19:42:40 +000011376 Py_ssize_t len;
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011377 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011378
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011379#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011380 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011381#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 if (_PyUnicode_HASH(self) != -1)
11383 return _PyUnicode_HASH(self);
11384 if (PyUnicode_READY(self) == -1)
11385 return -1;
11386 len = PyUnicode_GET_LENGTH(self);
Georg Brandl16fa2a12012-02-21 00:50:13 +010011387 /*
11388 We make the hash of the empty string be 0, rather than using
11389 (prefix ^ suffix), since this slightly obfuscates the hash secret
11390 */
11391 if (len == 0) {
11392 _PyUnicode_HASH(self) = 0;
11393 return 0;
11394 }
Christian Heimes985ecdc2013-11-20 11:46:18 +010011395 x = _Py_HashBytes(PyUnicode_DATA(self),
11396 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011397 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011398 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011399}
11400
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011401PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011402 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011403\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011404Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011405
11406static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011407unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011408{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011409 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011410 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011411 PyObject *substring = NULL;
11412 Py_ssize_t start = 0;
11413 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011414
Jesus Ceaac451502011-04-20 17:09:23 +020011415 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11416 &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011417 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011418
Christian Heimesd47a0452013-06-29 21:21:37 +020011419 if (PyUnicode_READY(self) == -1) {
11420 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011421 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011422 }
11423 if (PyUnicode_READY(substring) == -1) {
11424 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011425 return NULL;
Christian Heimesd47a0452013-06-29 21:21:37 +020011426 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011427
Victor Stinner7931d9a2011-11-04 00:22:48 +010011428 result = any_find_slice(1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011429
11430 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011431
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432 if (result == -2)
11433 return NULL;
11434
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435 if (result < 0) {
11436 PyErr_SetString(PyExc_ValueError, "substring not found");
11437 return NULL;
11438 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011439
Christian Heimes217cfd12007-12-02 14:31:20 +000011440 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441}
11442
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011443PyDoc_STRVAR(islower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011444 "S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011446Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011447at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011448
11449static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011450unicode_islower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 Py_ssize_t i, length;
11453 int kind;
11454 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011455 int cased;
11456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 if (PyUnicode_READY(self) == -1)
11458 return NULL;
11459 length = PyUnicode_GET_LENGTH(self);
11460 kind = PyUnicode_KIND(self);
11461 data = PyUnicode_DATA(self);
11462
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 if (length == 1)
11465 return PyBool_FromLong(
11466 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011467
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011468 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011470 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011471
Guido van Rossumd57fd912000-03-10 22:53:23 +000011472 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473 for (i = 0; i < length; i++) {
11474 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011475
Benjamin Peterson29060642009-01-31 22:14:21 +000011476 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11477 return PyBool_FromLong(0);
11478 else if (!cased && Py_UNICODE_ISLOWER(ch))
11479 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011480 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011481 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011482}
11483
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011484PyDoc_STRVAR(isupper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011485 "S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011486\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011487Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011488at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489
11490static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011491unicode_isupper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011493 Py_ssize_t i, length;
11494 int kind;
11495 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 int cased;
11497
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011498 if (PyUnicode_READY(self) == -1)
11499 return NULL;
11500 length = PyUnicode_GET_LENGTH(self);
11501 kind = PyUnicode_KIND(self);
11502 data = PyUnicode_DATA(self);
11503
Guido van Rossumd57fd912000-03-10 22:53:23 +000011504 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011505 if (length == 1)
11506 return PyBool_FromLong(
11507 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011509 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011510 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011511 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011512
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011514 for (i = 0; i < length; i++) {
11515 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011516
Benjamin Peterson29060642009-01-31 22:14:21 +000011517 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11518 return PyBool_FromLong(0);
11519 else if (!cased && Py_UNICODE_ISUPPER(ch))
11520 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011521 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011522 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523}
11524
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011525PyDoc_STRVAR(istitle__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 "S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011528Return True if S is a titlecased string and there is at least one\n\
11529character in S, i.e. upper- and titlecase characters may only\n\
11530follow uncased characters and lowercase characters only cased ones.\n\
11531Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011532
11533static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011534unicode_istitle(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011535{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011536 Py_ssize_t i, length;
11537 int kind;
11538 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011539 int cased, previous_is_cased;
11540
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011541 if (PyUnicode_READY(self) == -1)
11542 return NULL;
11543 length = PyUnicode_GET_LENGTH(self);
11544 kind = PyUnicode_KIND(self);
11545 data = PyUnicode_DATA(self);
11546
Guido van Rossumd57fd912000-03-10 22:53:23 +000011547 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011548 if (length == 1) {
11549 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11550 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11551 (Py_UNICODE_ISUPPER(ch) != 0));
11552 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011553
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011554 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011555 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011556 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011557
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558 cased = 0;
11559 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011560 for (i = 0; i < length; i++) {
11561 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011562
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11564 if (previous_is_cased)
11565 return PyBool_FromLong(0);
11566 previous_is_cased = 1;
11567 cased = 1;
11568 }
11569 else if (Py_UNICODE_ISLOWER(ch)) {
11570 if (!previous_is_cased)
11571 return PyBool_FromLong(0);
11572 previous_is_cased = 1;
11573 cased = 1;
11574 }
11575 else
11576 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011578 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579}
11580
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011581PyDoc_STRVAR(isspace__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 "S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011584Return True if all characters in S are whitespace\n\
11585and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011586
11587static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011588unicode_isspace(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011589{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 Py_ssize_t i, length;
11591 int kind;
11592 void *data;
11593
11594 if (PyUnicode_READY(self) == -1)
11595 return NULL;
11596 length = PyUnicode_GET_LENGTH(self);
11597 kind = PyUnicode_KIND(self);
11598 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 if (length == 1)
11602 return PyBool_FromLong(
11603 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011605 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011606 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011607 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011608
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 for (i = 0; i < length; i++) {
11610 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011611 if (!Py_UNICODE_ISSPACE(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011612 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011614 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615}
11616
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011617PyDoc_STRVAR(isalpha__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011618 "S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011619\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011620Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011621and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011622
11623static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011624unicode_isalpha(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011625{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 Py_ssize_t i, length;
11627 int kind;
11628 void *data;
11629
11630 if (PyUnicode_READY(self) == -1)
11631 return NULL;
11632 length = PyUnicode_GET_LENGTH(self);
11633 kind = PyUnicode_KIND(self);
11634 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011635
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011636 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011637 if (length == 1)
11638 return PyBool_FromLong(
11639 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011640
11641 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011642 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011643 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645 for (i = 0; i < length; i++) {
11646 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011647 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011648 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011649 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011650}
11651
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011652PyDoc_STRVAR(isalnum__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011653 "S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011654\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011655Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011656and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011657
11658static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011659unicode_isalnum(PyObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011660{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011661 int kind;
11662 void *data;
11663 Py_ssize_t len, i;
11664
11665 if (PyUnicode_READY(self) == -1)
11666 return NULL;
11667
11668 kind = PyUnicode_KIND(self);
11669 data = PyUnicode_DATA(self);
11670 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011671
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011672 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011673 if (len == 1) {
11674 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11675 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11676 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011677
11678 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011680 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 for (i = 0; i < len; i++) {
11683 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011684 if (!Py_UNICODE_ISALNUM(ch))
Benjamin Peterson29060642009-01-31 22:14:21 +000011685 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011686 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011687 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011688}
11689
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011690PyDoc_STRVAR(isdecimal__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011691 "S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011693Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011694False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695
11696static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011697unicode_isdecimal(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699 Py_ssize_t i, length;
11700 int kind;
11701 void *data;
11702
11703 if (PyUnicode_READY(self) == -1)
11704 return NULL;
11705 length = PyUnicode_GET_LENGTH(self);
11706 kind = PyUnicode_KIND(self);
11707 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710 if (length == 1)
11711 return PyBool_FromLong(
11712 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011713
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011714 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011716 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011717
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011718 for (i = 0; i < length; i++) {
11719 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011720 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011722 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011723}
11724
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011725PyDoc_STRVAR(isdigit__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011726 "S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011727\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +000011728Return True if all characters in S are digits\n\
11729and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011730
11731static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011732unicode_isdigit(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011734 Py_ssize_t i, length;
11735 int kind;
11736 void *data;
11737
11738 if (PyUnicode_READY(self) == -1)
11739 return NULL;
11740 length = PyUnicode_GET_LENGTH(self);
11741 kind = PyUnicode_KIND(self);
11742 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011745 if (length == 1) {
11746 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11747 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11748 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011750 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011751 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011752 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011753
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011754 for (i = 0; i < length; i++) {
11755 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011756 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011758 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759}
11760
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011761PyDoc_STRVAR(isnumeric__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011762 "S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +000011764Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011765False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766
11767static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011768unicode_isnumeric(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 Py_ssize_t i, length;
11771 int kind;
11772 void *data;
11773
11774 if (PyUnicode_READY(self) == -1)
11775 return NULL;
11776 length = PyUnicode_GET_LENGTH(self);
11777 kind = PyUnicode_KIND(self);
11778 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011781 if (length == 1)
11782 return PyBool_FromLong(
11783 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011785 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 if (length == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000011787 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011788
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 for (i = 0; i < length; i++) {
11790 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011791 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011793 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794}
11795
Martin v. Löwis47383402007-08-15 07:32:56 +000011796int
11797PyUnicode_IsIdentifier(PyObject *self)
11798{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011799 int kind;
11800 void *data;
11801 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011802 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000011803
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011804 if (PyUnicode_READY(self) == -1) {
11805 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000011806 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 }
11808
11809 /* Special case for empty strings */
11810 if (PyUnicode_GET_LENGTH(self) == 0)
11811 return 0;
11812 kind = PyUnicode_KIND(self);
11813 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000011814
11815 /* PEP 3131 says that the first character must be in
11816 XID_Start and subsequent characters in XID_Continue,
11817 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000011818 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000011819 letters, digits, underscore). However, given the current
11820 definition of XID_Start and XID_Continue, it is sufficient
11821 to check just for these, except that _ must be allowed
11822 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011823 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050011824 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000011825 return 0;
11826
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040011827 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000011829 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000011830 return 1;
11831}
11832
11833PyDoc_STRVAR(isidentifier__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011834 "S.isidentifier() -> bool\n\
Martin v. Löwis47383402007-08-15 07:32:56 +000011835\n\
11836Return True if S is a valid identifier according\n\
Raymond Hettinger378170d2013-03-23 08:21:12 -070011837to the language definition.\n\
11838\n\
11839Use keyword.iskeyword() to test for reserved identifiers\n\
11840such as \"def\" and \"class\".\n");
Martin v. Löwis47383402007-08-15 07:32:56 +000011841
11842static PyObject*
11843unicode_isidentifier(PyObject *self)
11844{
11845 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11846}
11847
Georg Brandl559e5d72008-06-11 18:37:52 +000011848PyDoc_STRVAR(isprintable__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011849 "S.isprintable() -> bool\n\
Georg Brandl559e5d72008-06-11 18:37:52 +000011850\n\
11851Return True if all characters in S are considered\n\
11852printable in repr() or S is empty, False otherwise.");
11853
11854static PyObject*
11855unicode_isprintable(PyObject *self)
11856{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 Py_ssize_t i, length;
11858 int kind;
11859 void *data;
11860
11861 if (PyUnicode_READY(self) == -1)
11862 return NULL;
11863 length = PyUnicode_GET_LENGTH(self);
11864 kind = PyUnicode_KIND(self);
11865 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000011866
11867 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011868 if (length == 1)
11869 return PyBool_FromLong(
11870 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000011871
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 for (i = 0; i < length; i++) {
11873 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000011874 Py_RETURN_FALSE;
11875 }
11876 }
11877 Py_RETURN_TRUE;
11878}
11879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011880PyDoc_STRVAR(join__doc__,
Georg Brandl495f7b52009-10-27 15:28:25 +000011881 "S.join(iterable) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882\n\
11883Return a string which is the concatenation of the strings in the\n\
Georg Brandl495f7b52009-10-27 15:28:25 +000011884iterable. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885
11886static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011887unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000011889 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890}
11891
Martin v. Löwis18e16552006-02-15 17:27:45 +000011892static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011893unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011894{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 if (PyUnicode_READY(self) == -1)
11896 return -1;
11897 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898}
11899
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011900PyDoc_STRVAR(ljust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011901 "S.ljust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000011903Return S left-justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011904done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905
11906static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020011907unicode_ljust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011909 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011910 Py_UCS4 fillchar = ' ';
11911
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011912 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913 return NULL;
11914
Benjamin Petersonbac79492012-01-14 13:34:47 -050011915 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010011916 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917
Victor Stinnerc4b49542011-12-11 22:44:26 +010011918 if (PyUnicode_GET_LENGTH(self) >= width)
11919 return unicode_result_unchanged(self);
11920
11921 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922}
11923
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011924PyDoc_STRVAR(lower__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011925 "S.lower() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011927Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928
11929static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020011930unicode_lower(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011932 if (PyUnicode_READY(self) == -1)
11933 return NULL;
11934 if (PyUnicode_IS_ASCII(self))
11935 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011936 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937}
11938
/* Which end(s) of the string a strip operation trims.  The values also
   serve as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the constants above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip type i: skips the three-character "|O:" prefix
   of the matching format string. */
#define STRIPNAME(i) (stripformat[i]+3)
11947
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011948/* externally visible for str.strip(unicode) */
11949PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011950_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011951{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 void *data;
11953 int kind;
11954 Py_ssize_t i, j, len;
11955 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011956 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011958 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11959 return NULL;
11960
11961 kind = PyUnicode_KIND(self);
11962 data = PyUnicode_DATA(self);
11963 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020011964 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11966 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020011967 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011968
Benjamin Peterson14339b62009-01-31 16:36:08 +000011969 i = 0;
11970 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011971 while (i < len) {
11972 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11973 if (!BLOOM(sepmask, ch))
11974 break;
11975 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11976 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011977 i++;
11978 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011979 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011980
Benjamin Peterson14339b62009-01-31 16:36:08 +000011981 j = len;
11982 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020011983 j--;
11984 while (j >= i) {
11985 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11986 if (!BLOOM(sepmask, ch))
11987 break;
11988 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11989 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000011990 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020011991 }
11992
Benjamin Peterson29060642009-01-31 22:14:21 +000011993 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011994 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000011995
Victor Stinner7931d9a2011-11-04 00:22:48 +010011996 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997}
11998
11999PyObject*
12000PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12001{
12002 unsigned char *data;
12003 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012004 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005
Victor Stinnerde636f32011-10-01 03:55:54 +020012006 if (PyUnicode_READY(self) == -1)
12007 return NULL;
12008
Victor Stinner684d5fd2012-05-03 02:32:34 +020012009 length = PyUnicode_GET_LENGTH(self);
12010 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012011
Victor Stinner684d5fd2012-05-03 02:32:34 +020012012 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012013 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014
Victor Stinnerde636f32011-10-01 03:55:54 +020012015 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012016 PyErr_SetString(PyExc_IndexError, "string index out of range");
12017 return NULL;
12018 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012019 if (start >= length || end < start)
12020 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012021
Victor Stinner684d5fd2012-05-03 02:32:34 +020012022 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012023 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012024 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012025 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012026 }
12027 else {
12028 kind = PyUnicode_KIND(self);
12029 data = PyUnicode_1BYTE_DATA(self);
12030 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012031 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012032 length);
12033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012034}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035
12036static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012037do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 Py_ssize_t len, i, j;
12040
12041 if (PyUnicode_READY(self) == -1)
12042 return NULL;
12043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012045
Victor Stinnercc7af722013-04-09 22:39:24 +020012046 if (PyUnicode_IS_ASCII(self)) {
12047 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12048
12049 i = 0;
12050 if (striptype != RIGHTSTRIP) {
12051 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012052 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012053 if (!_Py_ascii_whitespace[ch])
12054 break;
12055 i++;
12056 }
12057 }
12058
12059 j = len;
12060 if (striptype != LEFTSTRIP) {
12061 j--;
12062 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012063 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012064 if (!_Py_ascii_whitespace[ch])
12065 break;
12066 j--;
12067 }
12068 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012069 }
12070 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012071 else {
12072 int kind = PyUnicode_KIND(self);
12073 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012074
Victor Stinnercc7af722013-04-09 22:39:24 +020012075 i = 0;
12076 if (striptype != RIGHTSTRIP) {
12077 while (i < len) {
12078 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12079 if (!Py_UNICODE_ISSPACE(ch))
12080 break;
12081 i++;
12082 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012083 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012084
12085 j = len;
12086 if (striptype != LEFTSTRIP) {
12087 j--;
12088 while (j >= i) {
12089 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12090 if (!Py_UNICODE_ISSPACE(ch))
12091 break;
12092 j--;
12093 }
12094 j++;
12095 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012096 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012097
Victor Stinner7931d9a2011-11-04 00:22:48 +010012098 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099}
12100
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012101
12102static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012103do_argstrip(PyObject *self, int striptype, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012104{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012105 PyObject *sep = NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012106
Serhiy Storchakac6792272013-10-19 21:03:34 +030012107 if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012108 return NULL;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012109
Benjamin Peterson14339b62009-01-31 16:36:08 +000012110 if (sep != NULL && sep != Py_None) {
12111 if (PyUnicode_Check(sep))
12112 return _PyUnicode_XStrip(self, striptype, sep);
12113 else {
12114 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012115 "%s arg must be None or str",
12116 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012117 return NULL;
12118 }
12119 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012120
Benjamin Peterson14339b62009-01-31 16:36:08 +000012121 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012122}
12123
12124
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012125PyDoc_STRVAR(strip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012126 "S.strip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012127\n\
12128Return a copy of the string S with leading and trailing\n\
12129whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012130If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012131
12132static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012133unicode_strip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012134{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012135 if (PyTuple_GET_SIZE(args) == 0)
12136 return do_strip(self, BOTHSTRIP); /* Common case */
12137 else
12138 return do_argstrip(self, BOTHSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012139}
12140
12141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012142PyDoc_STRVAR(lstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012143 "S.lstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012144\n\
12145Return a copy of the string S with leading whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012146If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012147
12148static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012149unicode_lstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012150{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012151 if (PyTuple_GET_SIZE(args) == 0)
12152 return do_strip(self, LEFTSTRIP); /* Common case */
12153 else
12154 return do_argstrip(self, LEFTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012155}
12156
12157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012158PyDoc_STRVAR(rstrip__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012159 "S.rstrip([chars]) -> str\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012160\n\
12161Return a copy of the string S with trailing whitespace removed.\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000012162If chars is given and not None, remove characters in chars instead.");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012163
12164static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012165unicode_rstrip(PyObject *self, PyObject *args)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012166{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012167 if (PyTuple_GET_SIZE(args) == 0)
12168 return do_strip(self, RIGHTSTRIP); /* Common case */
12169 else
12170 return do_argstrip(self, RIGHTSTRIP, args);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012171}
12172
12173
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012175unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012177 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179
Serhiy Storchaka05997252013-01-26 12:14:02 +020012180 if (len < 1)
12181 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012182
Victor Stinnerc4b49542011-12-11 22:44:26 +010012183 /* no repeat, return original string */
12184 if (len == 1)
12185 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012186
Benjamin Petersonbac79492012-01-14 13:34:47 -050012187 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 return NULL;
12189
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012190 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012191 PyErr_SetString(PyExc_OverflowError,
12192 "repeated string is too long");
12193 return NULL;
12194 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012196
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012197 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198 if (!u)
12199 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012200 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012202 if (PyUnicode_GET_LENGTH(str) == 1) {
12203 const int kind = PyUnicode_KIND(str);
12204 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012205 if (kind == PyUnicode_1BYTE_KIND) {
12206 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012207 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012208 }
12209 else if (kind == PyUnicode_2BYTE_KIND) {
12210 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012211 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012212 ucs2[n] = fill_char;
12213 } else {
12214 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12215 assert(kind == PyUnicode_4BYTE_KIND);
12216 for (n = 0; n < len; ++n)
12217 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 }
12220 else {
12221 /* number of characters copied this far */
12222 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012223 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 char *to = (char *) PyUnicode_DATA(u);
12225 Py_MEMCPY(to, PyUnicode_DATA(str),
12226 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012227 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 n = (done <= nchars-done) ? done : nchars-done;
12229 Py_MEMCPY(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012230 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012231 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232 }
12233
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012234 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012235 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236}
12237
Alexander Belopolsky40018472011-02-26 01:02:56 +000012238PyObject *
12239PyUnicode_Replace(PyObject *obj,
12240 PyObject *subobj,
12241 PyObject *replobj,
12242 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012243{
12244 PyObject *self;
12245 PyObject *str1;
12246 PyObject *str2;
12247 PyObject *result;
12248
12249 self = PyUnicode_FromObject(obj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012250 if (self == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000012251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252 str1 = PyUnicode_FromObject(subobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012253 if (str1 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012254 Py_DECREF(self);
12255 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256 }
12257 str2 = PyUnicode_FromObject(replobj);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012258 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012259 Py_DECREF(self);
12260 Py_DECREF(str1);
12261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012263 if (PyUnicode_READY(self) == -1 ||
12264 PyUnicode_READY(str1) == -1 ||
12265 PyUnicode_READY(str2) == -1)
12266 result = NULL;
12267 else
12268 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269 Py_DECREF(self);
12270 Py_DECREF(str1);
12271 Py_DECREF(str2);
12272 return result;
12273}
12274
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012275PyDoc_STRVAR(replace__doc__,
Ezio Melottic1897e72010-06-26 18:50:39 +000012276 "S.replace(old, new[, count]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277\n\
12278Return a copy of S with all occurrences of substring\n\
Georg Brandlf08a9dd2008-06-10 16:57:31 +000012279old replaced by new. If the optional argument count is\n\
12280given, only the first count occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281
12282static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283unicode_replace(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012285 PyObject *str1;
12286 PyObject *str2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012287 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012288 PyObject *result;
12289
Martin v. Löwis18e16552006-02-15 17:27:45 +000012290 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060012292 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012293 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012294 str1 = PyUnicode_FromObject(str1);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012295 if (str1 == NULL)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012296 return NULL;
12297 str2 = PyUnicode_FromObject(str2);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012298 if (str2 == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012299 Py_DECREF(str1);
12300 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +000012301 }
Benjamin Peterson22a29702012-01-02 09:00:30 -060012302 if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12303 result = NULL;
12304 else
12305 result = replace(self, str1, str2, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012306
12307 Py_DECREF(str1);
12308 Py_DECREF(str2);
12309 return result;
12310}
12311
Alexander Belopolsky40018472011-02-26 01:02:56 +000012312static PyObject *
12313unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012314{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012315 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 Py_ssize_t isize;
12317 Py_ssize_t osize, squote, dquote, i, o;
12318 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012319 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012323 return NULL;
12324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012325 isize = PyUnicode_GET_LENGTH(unicode);
12326 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012327
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328 /* Compute length of output, quote characters, and
12329 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012330 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012331 max = 127;
12332 squote = dquote = 0;
12333 ikind = PyUnicode_KIND(unicode);
12334 for (i = 0; i < isize; i++) {
12335 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012336 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012337 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012338 case '\'': squote++; break;
12339 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012341 incr = 2;
12342 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012343 default:
12344 /* Fast-path ASCII */
12345 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012346 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012348 ;
12349 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012351 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012352 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012354 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012356 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012358 if (osize > PY_SSIZE_T_MAX - incr) {
12359 PyErr_SetString(PyExc_OverflowError,
12360 "string is too long to generate repr");
12361 return NULL;
12362 }
12363 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 }
12365
12366 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012367 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012369 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012370 if (dquote)
12371 /* Both squote and dquote present. Use squote,
12372 and escape them */
12373 osize += squote;
12374 else
12375 quote = '"';
12376 }
Victor Stinner55c08782013-04-14 18:45:39 +020012377 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378
12379 repr = PyUnicode_New(osize, max);
12380 if (repr == NULL)
12381 return NULL;
12382 okind = PyUnicode_KIND(repr);
12383 odata = PyUnicode_DATA(repr);
12384
12385 PyUnicode_WRITE(okind, odata, 0, quote);
12386 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012387 if (unchanged) {
12388 _PyUnicode_FastCopyCharacters(repr, 1,
12389 unicode, 0,
12390 isize);
12391 }
12392 else {
12393 for (i = 0, o = 1; i < isize; i++) {
12394 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012395
Victor Stinner55c08782013-04-14 18:45:39 +020012396 /* Escape quotes and backslashes */
12397 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012398 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012400 continue;
12401 }
12402
12403 /* Map special whitespace to '\t', \n', '\r' */
12404 if (ch == '\t') {
12405 PyUnicode_WRITE(okind, odata, o++, '\\');
12406 PyUnicode_WRITE(okind, odata, o++, 't');
12407 }
12408 else if (ch == '\n') {
12409 PyUnicode_WRITE(okind, odata, o++, '\\');
12410 PyUnicode_WRITE(okind, odata, o++, 'n');
12411 }
12412 else if (ch == '\r') {
12413 PyUnicode_WRITE(okind, odata, o++, '\\');
12414 PyUnicode_WRITE(okind, odata, o++, 'r');
12415 }
12416
12417 /* Map non-printable US ASCII to '\xhh' */
12418 else if (ch < ' ' || ch == 0x7F) {
12419 PyUnicode_WRITE(okind, odata, o++, '\\');
12420 PyUnicode_WRITE(okind, odata, o++, 'x');
12421 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12422 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12423 }
12424
12425 /* Copy ASCII characters as-is */
12426 else if (ch < 0x7F) {
12427 PyUnicode_WRITE(okind, odata, o++, ch);
12428 }
12429
12430 /* Non-ASCII characters */
12431 else {
12432 /* Map Unicode whitespace and control characters
12433 (categories Z* and C* except ASCII space)
12434 */
12435 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12436 PyUnicode_WRITE(okind, odata, o++, '\\');
12437 /* Map 8-bit characters to '\xhh' */
12438 if (ch <= 0xff) {
12439 PyUnicode_WRITE(okind, odata, o++, 'x');
12440 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12441 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12442 }
12443 /* Map 16-bit characters to '\uxxxx' */
12444 else if (ch <= 0xffff) {
12445 PyUnicode_WRITE(okind, odata, o++, 'u');
12446 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12447 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12448 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12449 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12450 }
12451 /* Map 21-bit characters to '\U00xxxxxx' */
12452 else {
12453 PyUnicode_WRITE(okind, odata, o++, 'U');
12454 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12455 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12456 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12457 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12458 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12459 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12460 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12461 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12462 }
12463 }
12464 /* Copy characters as-is */
12465 else {
12466 PyUnicode_WRITE(okind, odata, o++, ch);
12467 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012468 }
12469 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012470 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012471 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012472 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012473 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012474}
12475
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012476PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012477 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012478\n\
12479Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012480such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481arguments start and end are interpreted as in slice notation.\n\
12482\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012483Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012484
12485static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012487{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012488 /* initialize variables to prevent gcc warning */
12489 PyObject *substring = NULL;
12490 Py_ssize_t start = 0;
12491 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012492 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012493
Jesus Ceaac451502011-04-20 17:09:23 +020012494 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12495 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012496 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012497
Christian Heimesea71a522013-06-29 21:17:34 +020012498 if (PyUnicode_READY(self) == -1) {
12499 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012500 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012501 }
12502 if (PyUnicode_READY(substring) == -1) {
12503 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012505 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012506
Victor Stinner7931d9a2011-11-04 00:22:48 +010012507 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012508
12509 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012511 if (result == -2)
12512 return NULL;
12513
Christian Heimes217cfd12007-12-02 14:31:20 +000012514 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515}
12516
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012517PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012518 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012519\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012520Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
12522static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012523unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012525 /* initialize variables to prevent gcc warning */
12526 PyObject *substring = NULL;
12527 Py_ssize_t start = 0;
12528 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012529 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530
Jesus Ceaac451502011-04-20 17:09:23 +020012531 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12532 &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012533 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012534
Christian Heimesea71a522013-06-29 21:17:34 +020012535 if (PyUnicode_READY(self) == -1) {
12536 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012538 }
12539 if (PyUnicode_READY(substring) == -1) {
12540 Py_DECREF(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 return NULL;
Christian Heimesea71a522013-06-29 21:17:34 +020012542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543
Victor Stinner7931d9a2011-11-04 00:22:48 +010012544 result = any_find_slice(-1, self, substring, start, end);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545
12546 Py_DECREF(substring);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 if (result == -2)
12549 return NULL;
12550
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551 if (result < 0) {
12552 PyErr_SetString(PyExc_ValueError, "substring not found");
12553 return NULL;
12554 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012555
Christian Heimes217cfd12007-12-02 14:31:20 +000012556 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557}
12558
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012559PyDoc_STRVAR(rjust__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012560 "S.rjust(width[, fillchar]) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012561\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012562Return S right-justified in a string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012563done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012564
12565static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020012566unicode_rjust(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012567{
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000012568 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 Py_UCS4 fillchar = ' ';
12570
Victor Stinnere9a29352011-10-01 02:14:59 +020012571 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572 return NULL;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000012573
Benjamin Petersonbac79492012-01-14 13:34:47 -050012574 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012575 return NULL;
12576
Victor Stinnerc4b49542011-12-11 22:44:26 +010012577 if (PyUnicode_GET_LENGTH(self) >= width)
12578 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012579
Victor Stinnerc4b49542011-12-11 22:44:26 +010012580 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012581}
12582
Alexander Belopolsky40018472011-02-26 01:02:56 +000012583PyObject *
12584PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585{
12586 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +000012587
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588 s = PyUnicode_FromObject(s);
12589 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012590 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012591 if (sep != NULL) {
12592 sep = PyUnicode_FromObject(sep);
12593 if (sep == NULL) {
12594 Py_DECREF(s);
12595 return NULL;
12596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012597 }
12598
Victor Stinner9310abb2011-10-05 00:59:23 +020012599 result = split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600
12601 Py_DECREF(s);
12602 Py_XDECREF(sep);
12603 return result;
12604}
12605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012606PyDoc_STRVAR(split__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012607 "S.split(sep=None, maxsplit=-1) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012608\n\
12609Return a list of the words in S, using sep as the\n\
12610delimiter string. If maxsplit is given, at most maxsplit\n\
Alexandre Vassalotti5f8ced22008-05-16 00:03:33 +000012611splits are done. If sep is not specified or is None, any\n\
Alexandre Vassalotti8ae3e052008-05-16 00:41:41 +000012612whitespace string is a separator and empty strings are\n\
12613removed from the result.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614
12615static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012616unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012618 static char *kwlist[] = {"sep", "maxsplit", 0};
Guido van Rossumd57fd912000-03-10 22:53:23 +000012619 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012620 Py_ssize_t maxcount = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012621
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012622 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12623 kwlist, &substring, &maxcount))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624 return NULL;
12625
12626 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012627 return split(self, NULL, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012629 return split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630 else
Victor Stinner7931d9a2011-11-04 00:22:48 +010012631 return PyUnicode_Split(self, substring, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632}
12633
Thomas Wouters477c8d52006-05-27 19:21:47 +000012634PyObject *
12635PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12636{
12637 PyObject* str_obj;
12638 PyObject* sep_obj;
12639 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012640 int kind1, kind2;
12641 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012643
12644 str_obj = PyUnicode_FromObject(str_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012645 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012646 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012647 sep_obj = PyUnicode_FromObject(sep_in);
Benjamin Peterson22a29702012-01-02 09:00:30 -060012648 if (!sep_obj) {
12649 Py_DECREF(str_obj);
12650 return NULL;
12651 }
12652 if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12653 Py_DECREF(sep_obj);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012654 Py_DECREF(str_obj);
12655 return NULL;
12656 }
12657
Victor Stinner14f8f022011-10-05 20:58:25 +020012658 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 len1 = PyUnicode_GET_LENGTH(str_obj);
12661 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012662 if (kind1 < kind2 || len1 < len2) {
12663 _Py_INCREF_UNICODE_EMPTY();
12664 if (!unicode_empty)
12665 out = NULL;
12666 else {
12667 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12668 Py_DECREF(unicode_empty);
12669 }
12670 Py_DECREF(sep_obj);
12671 Py_DECREF(str_obj);
12672 return out;
12673 }
12674 buf1 = PyUnicode_DATA(str_obj);
12675 buf2 = PyUnicode_DATA(sep_obj);
12676 if (kind2 != kind1) {
12677 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12678 if (!buf2)
12679 goto onError;
12680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012682 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012684 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12685 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12686 else
12687 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 break;
12689 case PyUnicode_2BYTE_KIND:
12690 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12691 break;
12692 case PyUnicode_4BYTE_KIND:
12693 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12694 break;
12695 default:
12696 assert(0);
12697 out = 0;
12698 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012699
12700 Py_DECREF(sep_obj);
12701 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012702 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012703 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012704
12705 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012706 onError:
12707 Py_DECREF(sep_obj);
12708 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012709 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012710 PyMem_Free(buf2);
12711 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012712}
12713
12714
12715PyObject *
12716PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12717{
12718 PyObject* str_obj;
12719 PyObject* sep_obj;
12720 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012721 int kind1, kind2;
12722 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012724
12725 str_obj = PyUnicode_FromObject(str_in);
12726 if (!str_obj)
Benjamin Peterson29060642009-01-31 22:14:21 +000012727 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012728 sep_obj = PyUnicode_FromObject(sep_in);
12729 if (!sep_obj) {
12730 Py_DECREF(str_obj);
12731 return NULL;
12732 }
12733
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012734 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 len1 = PyUnicode_GET_LENGTH(str_obj);
12737 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012738 if (kind1 < kind2 || len1 < len2) {
12739 _Py_INCREF_UNICODE_EMPTY();
12740 if (!unicode_empty)
12741 out = NULL;
12742 else {
12743 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12744 Py_DECREF(unicode_empty);
12745 }
12746 Py_DECREF(sep_obj);
12747 Py_DECREF(str_obj);
12748 return out;
12749 }
12750 buf1 = PyUnicode_DATA(str_obj);
12751 buf2 = PyUnicode_DATA(sep_obj);
12752 if (kind2 != kind1) {
12753 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12754 if (!buf2)
12755 goto onError;
12756 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012757
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012758 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012759 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012760 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12761 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12762 else
12763 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012764 break;
12765 case PyUnicode_2BYTE_KIND:
12766 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12767 break;
12768 case PyUnicode_4BYTE_KIND:
12769 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12770 break;
12771 default:
12772 assert(0);
12773 out = 0;
12774 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012775
12776 Py_DECREF(sep_obj);
12777 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012778 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012779 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012780
12781 return out;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782 onError:
12783 Py_DECREF(sep_obj);
12784 Py_DECREF(str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012785 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012786 PyMem_Free(buf2);
12787 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012788}
12789
12790PyDoc_STRVAR(partition__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012791 "S.partition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012792\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012793Search for the separator sep in S, and return the part before it,\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012794the separator itself, and the part after it. If the separator is not\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012795found, return S and two empty strings.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012796
12797static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012798unicode_partition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012799{
Victor Stinner9310abb2011-10-05 00:59:23 +020012800 return PyUnicode_Partition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012801}
12802
12803PyDoc_STRVAR(rpartition__doc__,
Ezio Melotti5b2b2422010-01-25 11:58:28 +000012804 "S.rpartition(sep) -> (head, sep, tail)\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012805\n\
Georg Brandl17cb8a82008-05-30 08:20:09 +000012806Search for the separator sep in S, starting at the end of S, and return\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000012807the part before it, the separator itself, and the part after it. If the\n\
Benjamin Petersonf10a79a2008-10-11 00:49:57 +000012808separator is not found, return two empty strings and S.");
Thomas Wouters477c8d52006-05-27 19:21:47 +000012809
12810static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012811unicode_rpartition(PyObject *self, PyObject *separator)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012812{
Victor Stinner9310abb2011-10-05 00:59:23 +020012813 return PyUnicode_RPartition(self, separator);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012814}
12815
Alexander Belopolsky40018472011-02-26 01:02:56 +000012816PyObject *
12817PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012818{
12819 PyObject *result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012820
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012821 s = PyUnicode_FromObject(s);
12822 if (s == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000012823 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +000012824 if (sep != NULL) {
12825 sep = PyUnicode_FromObject(sep);
12826 if (sep == NULL) {
12827 Py_DECREF(s);
12828 return NULL;
12829 }
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012830 }
12831
Victor Stinner9310abb2011-10-05 00:59:23 +020012832 result = rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012833
12834 Py_DECREF(s);
12835 Py_XDECREF(sep);
12836 return result;
12837}
12838
12839PyDoc_STRVAR(rsplit__doc__,
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012840 "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012841\n\
12842Return a list of the words in S, using sep as the\n\
12843delimiter string, starting at the end of the string and\n\
12844working to the front. If maxsplit is given, at most maxsplit\n\
12845splits are done. If sep is not specified, any whitespace string\n\
12846is a separator.");
12847
12848static PyObject*
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012849unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012850{
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012851 static char *kwlist[] = {"sep", "maxsplit", 0};
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012852 PyObject *substring = Py_None;
Martin v. Löwis18e16552006-02-15 17:27:45 +000012853 Py_ssize_t maxcount = -1;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012854
Ezio Melotticda6b6d2012-02-26 09:39:55 +020012855 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12856 kwlist, &substring, &maxcount))
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012857 return NULL;
12858
12859 if (substring == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +000012860 return rsplit(self, NULL, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012861 else if (PyUnicode_Check(substring))
Victor Stinner9310abb2011-10-05 00:59:23 +020012862 return rsplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012863 else
Victor Stinner9310abb2011-10-05 00:59:23 +020012864 return PyUnicode_RSplit(self, substring, maxcount);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000012865}
12866
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012867PyDoc_STRVAR(splitlines__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012868 "S.splitlines([keepends]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012869\n\
12870Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +000012871Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012872is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873
12874static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012875unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876{
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012877 static char *kwlist[] = {"keepends", 0};
Guido van Rossum86662912000-04-11 15:38:46 +000012878 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879
Mark Dickinson0d5f6ad2011-09-24 09:14:39 +010012880 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12881 kwlist, &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882 return NULL;
12883
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012884 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885}
12886
12887static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000012888PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889{
Victor Stinnerc4b49542011-12-11 22:44:26 +010012890 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891}
12892
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012893PyDoc_STRVAR(swapcase__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012894 "S.swapcase() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895\n\
12896Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012897and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898
12899static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020012900unicode_swapcase(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901{
Benjamin Petersoneea48462012-01-16 14:28:50 -050012902 if (PyUnicode_READY(self) == -1)
12903 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012904 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012905}
12906
Larry Hastings61272b72014-01-07 12:41:53 -080012907/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000012908
Larry Hastings31826802013-10-19 00:09:25 -070012909@staticmethod
12910str.maketrans as unicode_maketrans
12911
12912 x: object
12913
12914 y: unicode=NULL
12915
12916 z: unicode=NULL
12917
12918 /
12919
12920Return a translation table usable for str.translate().
12921
12922If there is only one argument, it must be a dictionary mapping Unicode
12923ordinals (integers) or characters to Unicode ordinals, strings or None.
12924Character keys will be then converted to ordinals.
12925If there are two arguments, they must be strings of equal length, and
12926in the resulting dictionary, each character in x will be mapped to the
12927character at the same position in y. If there is a third argument, it
12928must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080012929[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070012930
Larry Hastings31826802013-10-19 00:09:25 -070012931static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080012932unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030012933/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070012934{
Georg Brandlceee0772007-11-27 23:48:05 +000012935 PyObject *new = NULL, *key, *value;
12936 Py_ssize_t i = 0;
12937 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012938
Georg Brandlceee0772007-11-27 23:48:05 +000012939 new = PyDict_New();
12940 if (!new)
12941 return NULL;
12942 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012943 int x_kind, y_kind, z_kind;
12944 void *x_data, *y_data, *z_data;
12945
Georg Brandlceee0772007-11-27 23:48:05 +000012946 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000012947 if (!PyUnicode_Check(x)) {
12948 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12949 "be a string if there is a second argument");
12950 goto err;
12951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012952 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012953 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12954 "arguments must have equal length");
12955 goto err;
12956 }
12957 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012958 x_kind = PyUnicode_KIND(x);
12959 y_kind = PyUnicode_KIND(y);
12960 x_data = PyUnicode_DATA(x);
12961 y_data = PyUnicode_DATA(y);
12962 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12963 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012964 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000012965 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060012966 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060012967 if (!value) {
12968 Py_DECREF(key);
12969 goto err;
12970 }
Georg Brandlceee0772007-11-27 23:48:05 +000012971 res = PyDict_SetItem(new, key, value);
12972 Py_DECREF(key);
12973 Py_DECREF(value);
12974 if (res < 0)
12975 goto err;
12976 }
12977 /* create entries for deleting chars in z */
12978 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 z_kind = PyUnicode_KIND(z);
12980 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020012981 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000012983 if (!key)
12984 goto err;
12985 res = PyDict_SetItem(new, key, Py_None);
12986 Py_DECREF(key);
12987 if (res < 0)
12988 goto err;
12989 }
12990 }
12991 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992 int kind;
12993 void *data;
12994
Georg Brandlceee0772007-11-27 23:48:05 +000012995 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000012996 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000012997 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12998 "to maketrans it must be a dict");
12999 goto err;
13000 }
13001 /* copy entries into the new dict, converting string keys to int keys */
13002 while (PyDict_Next(x, &i, &key, &value)) {
13003 if (PyUnicode_Check(key)) {
13004 /* convert string keys to integer keys */
13005 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013006 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013007 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13008 "table must be of length 1");
13009 goto err;
13010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 kind = PyUnicode_KIND(key);
13012 data = PyUnicode_DATA(key);
13013 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013014 if (!newkey)
13015 goto err;
13016 res = PyDict_SetItem(new, newkey, value);
13017 Py_DECREF(newkey);
13018 if (res < 0)
13019 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013020 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013021 /* just keep integer keys */
13022 if (PyDict_SetItem(new, key, value) < 0)
13023 goto err;
13024 } else {
13025 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13026 "be strings or integers");
13027 goto err;
13028 }
13029 }
13030 }
13031 return new;
13032 err:
13033 Py_DECREF(new);
13034 return NULL;
13035}
13036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013037PyDoc_STRVAR(translate__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013038 "S.translate(table) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013039\n\
Zachary Ware79b98df2015-08-05 23:54:15 -050013040Return a copy of the string S in which each character has been mapped\n\
13041through the given translation table. The table must implement\n\
13042lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13043mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13044this operation raises LookupError, the character is left untouched.\n\
13045Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013046
13047static PyObject*
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013048unicode_translate(PyObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013049{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013050 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013051}
13052
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013053PyDoc_STRVAR(upper__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013054 "S.upper() -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013055\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013056Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013057
13058static PyObject*
Victor Stinner9310abb2011-10-05 00:59:23 +020013059unicode_upper(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013060{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013061 if (PyUnicode_READY(self) == -1)
13062 return NULL;
13063 if (PyUnicode_IS_ASCII(self))
13064 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013065 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013066}
13067
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013068PyDoc_STRVAR(zfill__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013069 "S.zfill(width) -> str\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013070\n\
Benjamin Peterson9aa42992008-09-10 21:57:34 +000013071Pad a numeric string S with zeros on the left, to fill a field\n\
13072of the specified width. The string S is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013073
13074static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020013075unicode_zfill(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013076{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013077 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013078 PyObject *u;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013079 Py_ssize_t width;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013080 int kind;
13081 void *data;
13082 Py_UCS4 chr;
13083
Martin v. Löwis18e16552006-02-15 17:27:45 +000013084 if (!PyArg_ParseTuple(args, "n:zfill", &width))
Guido van Rossumd57fd912000-03-10 22:53:23 +000013085 return NULL;
13086
Benjamin Petersonbac79492012-01-14 13:34:47 -050013087 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013088 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013089
Victor Stinnerc4b49542011-12-11 22:44:26 +010013090 if (PyUnicode_GET_LENGTH(self) >= width)
13091 return unicode_result_unchanged(self);
13092
13093 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013094
13095 u = pad(self, fill, 0, '0');
13096
Walter Dörwald068325e2002-04-15 13:36:47 +000013097 if (u == NULL)
13098 return NULL;
13099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013100 kind = PyUnicode_KIND(u);
13101 data = PyUnicode_DATA(u);
13102 chr = PyUnicode_READ(kind, data, fill);
13103
13104 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013105 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013106 PyUnicode_WRITE(kind, data, 0, chr);
13107 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108 }
13109
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013110 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013111 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113
#if 0
/* Compiled out.  Thin wrapper that returns the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013122PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013123 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013125Return True if S starts with the specified prefix, False otherwise.\n\
13126With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013127With optional end, stop comparing S at that position.\n\
13128prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129
13130static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013131unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013132 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013134 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013135 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013136 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013137 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013138 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139
Jesus Ceaac451502011-04-20 17:09:23 +020013140 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013141 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013142 if (PyTuple_Check(subobj)) {
13143 Py_ssize_t i;
13144 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013145 substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013146 if (substring == NULL)
13147 return NULL;
13148 result = tailmatch(self, substring, start, end, -1);
13149 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013150 if (result == -1)
13151 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013152 if (result) {
13153 Py_RETURN_TRUE;
13154 }
13155 }
13156 /* nothing matched */
13157 Py_RETURN_FALSE;
13158 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013159 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013160 if (substring == NULL) {
13161 if (PyErr_ExceptionMatches(PyExc_TypeError))
13162 PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13163 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013164 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013165 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013166 result = tailmatch(self, substring, start, end, -1);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013167 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013168 if (result == -1)
13169 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013170 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013171}
13172
13173
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013174PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013175 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013176\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013177Return True if S ends with the specified suffix, False otherwise.\n\
13178With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013179With optional end, stop comparing S at that position.\n\
13180suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013181
13182static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013183unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013184 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013185{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013186 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013187 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013188 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013189 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013190 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013191
Jesus Ceaac451502011-04-20 17:09:23 +020013192 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013193 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013194 if (PyTuple_Check(subobj)) {
13195 Py_ssize_t i;
13196 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013197 substring = PyUnicode_FromObject(
Benjamin Peterson29060642009-01-31 22:14:21 +000013198 PyTuple_GET_ITEM(subobj, i));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013199 if (substring == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000013200 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013201 result = tailmatch(self, substring, start, end, +1);
13202 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013203 if (result == -1)
13204 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013205 if (result) {
13206 Py_RETURN_TRUE;
13207 }
13208 }
13209 Py_RETURN_FALSE;
13210 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013211 substring = PyUnicode_FromObject(subobj);
Ezio Melottiba42fd52011-04-26 06:09:45 +030013212 if (substring == NULL) {
13213 if (PyErr_ExceptionMatches(PyExc_TypeError))
13214 PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13215 "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013216 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013217 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013218 result = tailmatch(self, substring, start, end, +1);
Christian Heimes305e49e2013-06-29 20:41:06 +020013219 Py_DECREF(substring);
Victor Stinner18aa4472013-01-03 03:18:09 +010013220 if (result == -1)
13221 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013222 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013223}
13224
Victor Stinner202fdca2012-05-07 12:47:02 +020013225Py_LOCAL_INLINE(void)
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013226_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013227{
Victor Stinner8f674cc2013-04-17 23:02:17 +020013228 if (!writer->readonly)
13229 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13230 else {
13231 /* Copy-on-write mode: set buffer size to 0 so
13232 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13233 * next write. */
13234 writer->size = 0;
13235 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013236 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13237 writer->data = PyUnicode_DATA(writer->buffer);
13238 writer->kind = PyUnicode_KIND(writer->buffer);
13239}
13240
Victor Stinnerd3f08822012-05-29 12:57:52 +020013241void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013242_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013243{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013244 memset(writer, 0, sizeof(*writer));
13245#ifdef Py_DEBUG
13246 writer->kind = 5; /* invalid kind */
13247#endif
Victor Stinner8f674cc2013-04-17 23:02:17 +020013248 writer->min_char = 127;
Victor Stinner202fdca2012-05-07 12:47:02 +020013249}
13250
Victor Stinnerd3f08822012-05-29 12:57:52 +020013251int
13252_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13253 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013254{
Victor Stinner6989ba02013-11-18 21:08:39 +010013255#ifdef MS_WINDOWS
13256 /* On Windows, overallocate by 50% is the best factor */
13257# define OVERALLOCATE_FACTOR 2
13258#else
13259 /* On Linux, overallocate by 25% is the best factor */
13260# define OVERALLOCATE_FACTOR 4
13261#endif
Victor Stinner202fdca2012-05-07 12:47:02 +020013262 Py_ssize_t newlen;
13263 PyObject *newbuffer;
13264
Victor Stinnerd3f08822012-05-29 12:57:52 +020013265 assert(length > 0);
13266
Victor Stinner202fdca2012-05-07 12:47:02 +020013267 if (length > PY_SSIZE_T_MAX - writer->pos) {
13268 PyErr_NoMemory();
13269 return -1;
13270 }
13271 newlen = writer->pos + length;
13272
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013273 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013274
Victor Stinnerd3f08822012-05-29 12:57:52 +020013275 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013276 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013277 if (writer->overallocate
13278 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13279 /* overallocate to limit the number of realloc() */
13280 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013281 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013282 if (newlen < writer->min_length)
13283 newlen = writer->min_length;
13284
Victor Stinnerd3f08822012-05-29 12:57:52 +020013285 writer->buffer = PyUnicode_New(newlen, maxchar);
13286 if (writer->buffer == NULL)
13287 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013288 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013289 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013290 if (writer->overallocate
13291 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13292 /* overallocate to limit the number of realloc() */
13293 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013294 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013295 if (newlen < writer->min_length)
13296 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013297
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013298 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013299 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013300 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013301 newbuffer = PyUnicode_New(newlen, maxchar);
13302 if (newbuffer == NULL)
13303 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013304 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13305 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013306 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013307 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013308 }
13309 else {
13310 newbuffer = resize_compact(writer->buffer, newlen);
13311 if (newbuffer == NULL)
13312 return -1;
13313 }
13314 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013315 }
13316 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013317 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013318 newbuffer = PyUnicode_New(writer->size, maxchar);
13319 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013320 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013321 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13322 writer->buffer, 0, writer->pos);
Serhiy Storchaka5a57ade2015-12-24 10:35:59 +020013323 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013324 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013325 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013326 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013327
13328#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013329}
13330
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013331Py_LOCAL_INLINE(int)
13332_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013333{
13334 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13335 return -1;
13336 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13337 writer->pos++;
13338 return 0;
13339}
13340
13341int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013342_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13343{
13344 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13345}
13346
13347int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013348_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13349{
13350 Py_UCS4 maxchar;
13351 Py_ssize_t len;
13352
13353 if (PyUnicode_READY(str) == -1)
13354 return -1;
13355 len = PyUnicode_GET_LENGTH(str);
13356 if (len == 0)
13357 return 0;
13358 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13359 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013360 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013361 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013362 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013363 Py_INCREF(str);
13364 writer->buffer = str;
13365 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013366 writer->pos += len;
13367 return 0;
13368 }
13369 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13370 return -1;
13371 }
13372 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13373 str, 0, len);
13374 writer->pos += len;
13375 return 0;
13376}
13377
Victor Stinnere215d962012-10-06 23:03:36 +020013378int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013379_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13380 Py_ssize_t start, Py_ssize_t end)
13381{
13382 Py_UCS4 maxchar;
13383 Py_ssize_t len;
13384
13385 if (PyUnicode_READY(str) == -1)
13386 return -1;
13387
13388 assert(0 <= start);
13389 assert(end <= PyUnicode_GET_LENGTH(str));
13390 assert(start <= end);
13391
13392 if (end == 0)
13393 return 0;
13394
13395 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13396 return _PyUnicodeWriter_WriteStr(writer, str);
13397
13398 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13399 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13400 else
13401 maxchar = writer->maxchar;
13402 len = end - start;
13403
13404 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13405 return -1;
13406
13407 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13408 str, start, len);
13409 writer->pos += len;
13410 return 0;
13411}
13412
13413int
Victor Stinner4a587072013-11-19 12:54:53 +010013414_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13415 const char *ascii, Py_ssize_t len)
13416{
13417 if (len == -1)
13418 len = strlen(ascii);
13419
13420 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13421
13422 if (writer->buffer == NULL && !writer->overallocate) {
13423 PyObject *str;
13424
13425 str = _PyUnicode_FromASCII(ascii, len);
13426 if (str == NULL)
13427 return -1;
13428
13429 writer->readonly = 1;
13430 writer->buffer = str;
13431 _PyUnicodeWriter_Update(writer);
13432 writer->pos += len;
13433 return 0;
13434 }
13435
13436 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13437 return -1;
13438
13439 switch (writer->kind)
13440 {
13441 case PyUnicode_1BYTE_KIND:
13442 {
13443 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13444 Py_UCS1 *data = writer->data;
13445
13446 Py_MEMCPY(data + writer->pos, str, len);
13447 break;
13448 }
13449 case PyUnicode_2BYTE_KIND:
13450 {
13451 _PyUnicode_CONVERT_BYTES(
13452 Py_UCS1, Py_UCS2,
13453 ascii, ascii + len,
13454 (Py_UCS2 *)writer->data + writer->pos);
13455 break;
13456 }
13457 case PyUnicode_4BYTE_KIND:
13458 {
13459 _PyUnicode_CONVERT_BYTES(
13460 Py_UCS1, Py_UCS4,
13461 ascii, ascii + len,
13462 (Py_UCS4 *)writer->data + writer->pos);
13463 break;
13464 }
13465 default:
13466 assert(0);
13467 }
13468
13469 writer->pos += len;
13470 return 0;
13471}
13472
13473int
13474_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13475 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013476{
13477 Py_UCS4 maxchar;
13478
13479 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13480 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13481 return -1;
13482 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13483 writer->pos += len;
13484 return 0;
13485}
13486
Victor Stinnerd3f08822012-05-29 12:57:52 +020013487PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013488_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013489{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013490 PyObject *str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013491 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013492 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013493 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013494 }
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013495 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013496 str = writer->buffer;
13497 writer->buffer = NULL;
13498 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13499 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013500 }
13501 if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13502 PyObject *newbuffer;
13503 newbuffer = resize_compact(writer->buffer, writer->pos);
13504 if (newbuffer == NULL) {
Serhiy Storchakadfe98a12014-02-09 13:46:20 +020013505 Py_CLEAR(writer->buffer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013506 return NULL;
13507 }
13508 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013509 }
Victor Stinner15a0bd32013-07-08 22:29:55 +020013510 str = writer->buffer;
13511 writer->buffer = NULL;
13512 assert(_PyUnicode_CheckConsistency(str, 1));
13513 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013514}
13515
Victor Stinnerd3f08822012-05-29 12:57:52 +020013516void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013517_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013518{
13519 Py_CLEAR(writer->buffer);
13520}
13521
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013522#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013523
13524PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013525 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013526\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013527Return a formatted version of S, using substitutions from args and kwargs.\n\
13528The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013529
Eric Smith27bbca62010-11-04 17:06:58 +000013530PyDoc_STRVAR(format_map__doc__,
13531 "S.format_map(mapping) -> str\n\
13532\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013533Return a formatted version of S, using substitutions from mapping.\n\
13534The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013535
Eric Smith4a7d76d2008-05-30 18:10:19 +000013536static PyObject *
13537unicode__format__(PyObject* self, PyObject* args)
13538{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013539 PyObject *format_spec;
13540 _PyUnicodeWriter writer;
13541 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013542
13543 if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13544 return NULL;
13545
Victor Stinnerd3f08822012-05-29 12:57:52 +020013546 if (PyUnicode_READY(self) == -1)
13547 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013548 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013549 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13550 self, format_spec, 0,
13551 PyUnicode_GET_LENGTH(format_spec));
13552 if (ret == -1) {
13553 _PyUnicodeWriter_Dealloc(&writer);
13554 return NULL;
13555 }
13556 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013557}
13558
Eric Smith8c663262007-08-25 02:26:07 +000013559PyDoc_STRVAR(p_format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013560 "S.__format__(format_spec) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013561\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013562Return a formatted version of S as described by format_spec.");
Eric Smith8c663262007-08-25 02:26:07 +000013563
13564static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013565unicode__sizeof__(PyObject *v)
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013566{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013567 Py_ssize_t size;
13568
13569 /* If it's a compact object, account for base structure +
13570 character data. */
13571 if (PyUnicode_IS_COMPACT_ASCII(v))
13572 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13573 else if (PyUnicode_IS_COMPACT(v))
13574 size = sizeof(PyCompactUnicodeObject) +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013575 (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013576 else {
13577 /* If it is a two-block object, account for base object, and
13578 for character block if present. */
13579 size = sizeof(PyUnicodeObject);
Victor Stinnerc3c74152011-10-02 20:39:55 +020013580 if (_PyUnicode_DATA_ANY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013581 size += (PyUnicode_GET_LENGTH(v) + 1) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020013582 PyUnicode_KIND(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013583 }
13584 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013585 with the data pointer. Check if the data is not shared. */
Victor Stinner03490912011-10-03 23:45:12 +020013586 if (_PyUnicode_HAS_WSTR_MEMORY(v))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013587 size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
Victor Stinner829c0ad2011-10-03 01:08:02 +020013588 if (_PyUnicode_HAS_UTF8_MEMORY(v))
Victor Stinnere90fe6a2011-10-01 16:48:13 +020013589 size += PyUnicode_UTF8_LENGTH(v) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013590
13591 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013592}
13593
13594PyDoc_STRVAR(sizeof__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013595 "S.__sizeof__() -> size of S in memory, in bytes");
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013596
13597static PyObject *
Victor Stinner034f6cf2011-09-30 02:26:44 +020013598unicode_getnewargs(PyObject *v)
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013599{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013600 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013601 if (!copy)
13602 return NULL;
13603 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013604}
13605
Guido van Rossumd57fd912000-03-10 22:53:23 +000013606static PyMethodDef unicode_methods[] = {
Benjamin Peterson28a4dce2010-12-12 01:33:04 +000013607 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013608 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
Ezio Melotticda6b6d2012-02-26 09:39:55 +020013609 {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13610 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013611 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13612 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
Benjamin Petersond5890c82012-01-14 13:23:30 -050013613 {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013614 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13615 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13616 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013617 {"expandtabs", (PyCFunction) unicode_expandtabs,
13618 METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013619 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013620 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013621 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13622 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13623 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013624 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013625 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13626 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13627 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013628 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Thomas Wouters477c8d52006-05-27 19:21:47 +000013629 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
Ezio Melotti745d54d2013-11-16 19:10:57 +020013630 {"splitlines", (PyCFunction) unicode_splitlines,
13631 METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013632 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013633 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13634 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13635 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13636 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13637 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13638 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13639 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13640 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13641 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13642 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13643 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13644 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13645 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13646 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwis47383402007-08-15 07:32:56 +000013647 {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
Georg Brandl559e5d72008-06-11 18:37:52 +000013648 {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013649 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Eric Smith9cd1e092007-08-31 18:39:38 +000013650 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013651 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
Eric Smith4a7d76d2008-05-30 18:10:19 +000013652 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
Larry Hastings31826802013-10-19 00:09:25 -070013653 UNICODE_MAKETRANS_METHODDEF
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013654 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +000013655#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013656 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013657 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013658#endif
13659
Benjamin Peterson14339b62009-01-31 16:36:08 +000013660 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013661 {NULL, NULL}
13662};
13663
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013664static PyObject *
13665unicode_mod(PyObject *v, PyObject *w)
13666{
Brian Curtindfc80e32011-08-10 20:28:54 -050013667 if (!PyUnicode_Check(v))
13668 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013669 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013670}
13671
13672static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013673 0, /*nb_add*/
13674 0, /*nb_subtract*/
13675 0, /*nb_multiply*/
13676 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013677};
13678
Guido van Rossumd57fd912000-03-10 22:53:23 +000013679static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013680 (lenfunc) unicode_length, /* sq_length */
13681 PyUnicode_Concat, /* sq_concat */
13682 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13683 (ssizeargfunc) unicode_getitem, /* sq_item */
13684 0, /* sq_slice */
13685 0, /* sq_ass_item */
13686 0, /* sq_ass_slice */
13687 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013688};
13689
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013690static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013691unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013692{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013693 if (PyUnicode_READY(self) == -1)
13694 return NULL;
13695
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013696 if (PyIndex_Check(item)) {
13697 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013698 if (i == -1 && PyErr_Occurred())
13699 return NULL;
13700 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013701 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013702 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013703 } else if (PySlice_Check(item)) {
Martin v. Löwis18e16552006-02-15 17:27:45 +000013704 Py_ssize_t start, stop, step, slicelength, cur, i;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013705 PyObject *result;
13706 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013707 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013708 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013709
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013710 if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
Benjamin Peterson29060642009-01-31 22:14:21 +000013711 &start, &stop, &step, &slicelength) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013712 return NULL;
13713 }
13714
13715 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020013716 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013717 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010013718 slicelength == PyUnicode_GET_LENGTH(self)) {
13719 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000013720 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013721 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020013722 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013723 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013724 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013725 src_kind = PyUnicode_KIND(self);
13726 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020013727 if (!PyUnicode_IS_ASCII(self)) {
13728 kind_limit = kind_maxchar_limit(src_kind);
13729 max_char = 0;
13730 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13731 ch = PyUnicode_READ(src_kind, src_data, cur);
13732 if (ch > max_char) {
13733 max_char = ch;
13734 if (max_char >= kind_limit)
13735 break;
13736 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013737 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013738 }
Victor Stinner55c99112011-10-13 01:17:06 +020013739 else
13740 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013741 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013742 if (result == NULL)
13743 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013744 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013745 dest_data = PyUnicode_DATA(result);
13746
13747 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013748 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13749 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013750 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013751 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013752 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013753 } else {
13754 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13755 return NULL;
13756 }
13757}
13758
13759static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013760 (lenfunc)unicode_length, /* mp_length */
13761 (binaryfunc)unicode_subscript, /* mp_subscript */
13762 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013763};
13764
Guido van Rossumd57fd912000-03-10 22:53:23 +000013765
Guido van Rossumd57fd912000-03-10 22:53:23 +000013766/* Helpers for PyUnicode_Format() */
13767
Victor Stinnera47082312012-10-04 02:19:54 +020013768struct unicode_formatter_t {
13769 PyObject *args;
13770 int args_owned;
13771 Py_ssize_t arglen, argidx;
13772 PyObject *dict;
13773
13774 enum PyUnicode_Kind fmtkind;
13775 Py_ssize_t fmtcnt, fmtpos;
13776 void *fmtdata;
13777 PyObject *fmtstr;
13778
13779 _PyUnicodeWriter writer;
13780};
13781
13782struct unicode_format_arg_t {
13783 Py_UCS4 ch;
13784 int flags;
13785 Py_ssize_t width;
13786 int prec;
13787 int sign;
13788};
13789
Guido van Rossumd57fd912000-03-10 22:53:23 +000013790static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020013791unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013792{
Victor Stinnera47082312012-10-04 02:19:54 +020013793 Py_ssize_t argidx = ctx->argidx;
13794
13795 if (argidx < ctx->arglen) {
13796 ctx->argidx++;
13797 if (ctx->arglen < 0)
13798 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000013799 else
Victor Stinnera47082312012-10-04 02:19:54 +020013800 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013801 }
13802 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000013803 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013804 return NULL;
13805}
13806
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013807/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013808
Victor Stinnera47082312012-10-04 02:19:54 +020013809/* Format a float into the writer if the writer is not NULL, or into *p_output
13810 otherwise.
13811
13812 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020013813static int
Victor Stinnera47082312012-10-04 02:19:54 +020013814formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13815 PyObject **p_output,
13816 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013817{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013818 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013819 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013820 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020013821 int prec;
13822 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000013823
Guido van Rossumd57fd912000-03-10 22:53:23 +000013824 x = PyFloat_AsDouble(v);
13825 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020013826 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013827
Victor Stinnera47082312012-10-04 02:19:54 +020013828 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013829 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013830 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000013831
Victor Stinnera47082312012-10-04 02:19:54 +020013832 if (arg->flags & F_ALT)
13833 dtoa_flags = Py_DTSF_ALT;
13834 else
13835 dtoa_flags = 0;
13836 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000013837 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020013838 return -1;
13839 len = strlen(p);
13840 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010013841 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020013842 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013843 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020013844 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020013845 }
13846 else
13847 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000013848 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013849 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013850}
13851
Victor Stinnerd0880d52012-04-27 23:40:13 +020013852/* formatlong() emulates the format codes d, u, o, x and X, and
13853 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
13854 * Python's regular ints.
13855 * Return value: a new PyUnicodeObject*, or NULL if error.
13856 * The output string is of the form
13857 * "-"? ("0x" | "0X")? digit+
13858 * "0x"/"0X" are present only for x and X conversions, with F_ALT
13859 * set in flags. The case of hex digits will be correct,
13860 * There will be at least prec digits, zero-filled on the left if
13861 * necessary to get that many.
13862 * val object to be converted
13863 * flags bitmask of format flags; only F_ALT is looked at
13864 * prec minimum number of digits; 0-fill on left if needed
13865 * type a character in [duoxX]; u acts the same as d
13866 *
13867 * CAUTION: o, x and X conversions on regular ints can never
13868 * produce a '-' sign, but can for Python's unbounded ints.
13869 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013870PyObject *
13871_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000013872{
Victor Stinnerd0880d52012-04-27 23:40:13 +020013873 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013874 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013875 Py_ssize_t i;
13876 int sign; /* 1 if '-', else 0 */
13877 int len; /* number of characters */
13878 Py_ssize_t llen;
13879 int numdigits; /* len == numnondigits + numdigits */
13880 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000013881
Victor Stinnerd0880d52012-04-27 23:40:13 +020013882 /* Avoid exceeding SSIZE_T_MAX */
13883 if (prec > INT_MAX-3) {
13884 PyErr_SetString(PyExc_OverflowError,
13885 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000013886 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013887 }
13888
13889 assert(PyLong_Check(val));
13890
13891 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020013892 default:
13893 assert(!"'type' not in [diuoxX]");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013894 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020013895 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020013896 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070013897 /* int and int subclasses should print numerically when a numeric */
13898 /* format code is used (see issue18780) */
13899 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013900 break;
13901 case 'o':
13902 numnondigits = 2;
13903 result = PyNumber_ToBase(val, 8);
13904 break;
13905 case 'x':
13906 case 'X':
13907 numnondigits = 2;
13908 result = PyNumber_ToBase(val, 16);
13909 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020013910 }
13911 if (!result)
13912 return NULL;
13913
13914 assert(unicode_modifiable(result));
13915 assert(PyUnicode_IS_READY(result));
13916 assert(PyUnicode_IS_ASCII(result));
13917
13918 /* To modify the string in-place, there can only be one reference. */
13919 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013920 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013921 PyErr_BadInternalCall();
13922 return NULL;
13923 }
13924 buf = PyUnicode_DATA(result);
13925 llen = PyUnicode_GET_LENGTH(result);
13926 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020013927 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013928 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080013929 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020013930 return NULL;
13931 }
13932 len = (int)llen;
13933 sign = buf[0] == '-';
13934 numnondigits += sign;
13935 numdigits = len - numnondigits;
13936 assert(numdigits > 0);
13937
13938 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080013939 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020013940 (type == 'o' || type == 'x' || type == 'X'))) {
13941 assert(buf[sign] == '0');
13942 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13943 buf[sign+1] == 'o');
13944 numnondigits -= 2;
13945 buf += 2;
13946 len -= 2;
13947 if (sign)
13948 buf[0] = '-';
13949 assert(len == numnondigits + numdigits);
13950 assert(numdigits > 0);
13951 }
13952
13953 /* Fill with leading zeroes to meet minimum width. */
13954 if (prec > numdigits) {
13955 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13956 numnondigits + prec);
13957 char *b1;
13958 if (!r1) {
13959 Py_DECREF(result);
13960 return NULL;
13961 }
13962 b1 = PyBytes_AS_STRING(r1);
13963 for (i = 0; i < numnondigits; ++i)
13964 *b1++ = *buf++;
13965 for (i = 0; i < prec - numdigits; i++)
13966 *b1++ = '0';
13967 for (i = 0; i < numdigits; i++)
13968 *b1++ = *buf++;
13969 *b1 = '\0';
13970 Py_DECREF(result);
13971 result = r1;
13972 buf = PyBytes_AS_STRING(result);
13973 len = numnondigits + prec;
13974 }
13975
13976 /* Fix up case for hex conversions. */
13977 if (type == 'X') {
13978 /* Need to convert all lower case letters to upper case.
13979 and need to convert 0x to 0X (and -0x to -0X). */
13980 for (i = 0; i < len; i++)
13981 if (buf[i] >= 'a' && buf[i] <= 'x')
13982 buf[i] -= 'a'-'A';
13983 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013984 if (!PyUnicode_Check(result)
13985 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020013986 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013987 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020013988 Py_DECREF(result);
13989 result = unicode;
13990 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020013991 else if (len != PyUnicode_GET_LENGTH(result)) {
13992 if (PyUnicode_Resize(&result, len) < 0)
13993 Py_CLEAR(result);
13994 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000013995 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000013996}
13997
Ethan Furmandf3ed242014-01-05 06:50:30 -080013998/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020013999 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014000 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014001 * -1 and raise an exception on error */
14002static int
Victor Stinnera47082312012-10-04 02:19:54 +020014003mainformatlong(PyObject *v,
14004 struct unicode_format_arg_t *arg,
14005 PyObject **p_output,
14006 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014007{
14008 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014009 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014010
14011 if (!PyNumber_Check(v))
14012 goto wrongtype;
14013
Ethan Furman9ab74802014-03-21 06:38:46 -070014014 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014015 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014016 if (type == 'o' || type == 'x' || type == 'X') {
14017 iobj = PyNumber_Index(v);
14018 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014019 if (PyErr_ExceptionMatches(PyExc_TypeError))
14020 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014021 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014022 }
14023 }
14024 else {
14025 iobj = PyNumber_Long(v);
14026 if (iobj == NULL ) {
14027 if (PyErr_ExceptionMatches(PyExc_TypeError))
14028 goto wrongtype;
14029 return -1;
14030 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014031 }
14032 assert(PyLong_Check(iobj));
14033 }
14034 else {
14035 iobj = v;
14036 Py_INCREF(iobj);
14037 }
14038
14039 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014040 && arg->width == -1 && arg->prec == -1
14041 && !(arg->flags & (F_SIGN | F_BLANK))
14042 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014043 {
14044 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014045 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014046 int base;
14047
Victor Stinnera47082312012-10-04 02:19:54 +020014048 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014049 {
14050 default:
14051 assert(0 && "'type' not in [diuoxX]");
14052 case 'd':
14053 case 'i':
14054 case 'u':
14055 base = 10;
14056 break;
14057 case 'o':
14058 base = 8;
14059 break;
14060 case 'x':
14061 case 'X':
14062 base = 16;
14063 break;
14064 }
14065
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014066 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14067 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014068 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014069 }
14070 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014071 return 1;
14072 }
14073
Ethan Furmanb95b5612015-01-23 20:05:18 -080014074 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014075 Py_DECREF(iobj);
14076 if (res == NULL)
14077 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014078 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014079 return 0;
14080
14081wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014082 switch(type)
14083 {
14084 case 'o':
14085 case 'x':
14086 case 'X':
14087 PyErr_Format(PyExc_TypeError,
14088 "%%%c format: an integer is required, "
14089 "not %.200s",
14090 type, Py_TYPE(v)->tp_name);
14091 break;
14092 default:
14093 PyErr_Format(PyExc_TypeError,
14094 "%%%c format: a number is required, "
14095 "not %.200s",
14096 type, Py_TYPE(v)->tp_name);
14097 break;
14098 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014099 return -1;
14100}
14101
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014102static Py_UCS4
14103formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014104{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014105 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014106 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014107 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014108 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014109 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014110 goto onError;
14111 }
14112 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014113 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014114 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014115 /* make sure number is a type of integer */
14116 if (!PyLong_Check(v)) {
14117 iobj = PyNumber_Index(v);
14118 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014119 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014120 }
14121 v = iobj;
14122 Py_DECREF(iobj);
14123 }
14124 /* Integer input truncated to a character */
Benjamin Peterson29060642009-01-31 22:14:21 +000014125 x = PyLong_AsLong(v);
14126 if (x == -1 && PyErr_Occurred())
14127 goto onError;
14128
Victor Stinner8faf8212011-12-08 22:14:11 +010014129 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014130 PyErr_SetString(PyExc_OverflowError,
14131 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014132 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014133 }
14134
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014135 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014136 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014137
Benjamin Peterson29060642009-01-31 22:14:21 +000014138 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014139 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014140 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014141 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014142}
14143
Victor Stinnera47082312012-10-04 02:19:54 +020014144/* Parse options of an argument: flags, width, precision.
14145 Handle also "%(name)" syntax.
14146
14147 Return 0 if the argument has been formatted into arg->str.
14148 Return 1 if the argument has been written into ctx->writer,
14149 Raise an exception and return -1 on error. */
14150static int
14151unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14152 struct unicode_format_arg_t *arg)
14153{
14154#define FORMAT_READ(ctx) \
14155 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14156
14157 PyObject *v;
14158
Victor Stinnera47082312012-10-04 02:19:54 +020014159 if (arg->ch == '(') {
14160 /* Get argument value from a dictionary. Example: "%(name)s". */
14161 Py_ssize_t keystart;
14162 Py_ssize_t keylen;
14163 PyObject *key;
14164 int pcount = 1;
14165
14166 if (ctx->dict == NULL) {
14167 PyErr_SetString(PyExc_TypeError,
14168 "format requires a mapping");
14169 return -1;
14170 }
14171 ++ctx->fmtpos;
14172 --ctx->fmtcnt;
14173 keystart = ctx->fmtpos;
14174 /* Skip over balanced parentheses */
14175 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14176 arg->ch = FORMAT_READ(ctx);
14177 if (arg->ch == ')')
14178 --pcount;
14179 else if (arg->ch == '(')
14180 ++pcount;
14181 ctx->fmtpos++;
14182 }
14183 keylen = ctx->fmtpos - keystart - 1;
14184 if (ctx->fmtcnt < 0 || pcount > 0) {
14185 PyErr_SetString(PyExc_ValueError,
14186 "incomplete format key");
14187 return -1;
14188 }
14189 key = PyUnicode_Substring(ctx->fmtstr,
14190 keystart, keystart + keylen);
14191 if (key == NULL)
14192 return -1;
14193 if (ctx->args_owned) {
14194 Py_DECREF(ctx->args);
14195 ctx->args_owned = 0;
14196 }
14197 ctx->args = PyObject_GetItem(ctx->dict, key);
14198 Py_DECREF(key);
14199 if (ctx->args == NULL)
14200 return -1;
14201 ctx->args_owned = 1;
14202 ctx->arglen = -1;
14203 ctx->argidx = -2;
14204 }
14205
14206 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014207 while (--ctx->fmtcnt >= 0) {
14208 arg->ch = FORMAT_READ(ctx);
14209 ctx->fmtpos++;
14210 switch (arg->ch) {
14211 case '-': arg->flags |= F_LJUST; continue;
14212 case '+': arg->flags |= F_SIGN; continue;
14213 case ' ': arg->flags |= F_BLANK; continue;
14214 case '#': arg->flags |= F_ALT; continue;
14215 case '0': arg->flags |= F_ZERO; continue;
14216 }
14217 break;
14218 }
14219
14220 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014221 if (arg->ch == '*') {
14222 v = unicode_format_getnextarg(ctx);
14223 if (v == NULL)
14224 return -1;
14225 if (!PyLong_Check(v)) {
14226 PyErr_SetString(PyExc_TypeError,
14227 "* wants int");
14228 return -1;
14229 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014230 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014231 if (arg->width == -1 && PyErr_Occurred())
14232 return -1;
14233 if (arg->width < 0) {
14234 arg->flags |= F_LJUST;
14235 arg->width = -arg->width;
14236 }
14237 if (--ctx->fmtcnt >= 0) {
14238 arg->ch = FORMAT_READ(ctx);
14239 ctx->fmtpos++;
14240 }
14241 }
14242 else if (arg->ch >= '0' && arg->ch <= '9') {
14243 arg->width = arg->ch - '0';
14244 while (--ctx->fmtcnt >= 0) {
14245 arg->ch = FORMAT_READ(ctx);
14246 ctx->fmtpos++;
14247 if (arg->ch < '0' || arg->ch > '9')
14248 break;
14249 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14250 mixing signed and unsigned comparison. Since arg->ch is between
14251 '0' and '9', casting to int is safe. */
14252 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14253 PyErr_SetString(PyExc_ValueError,
14254 "width too big");
14255 return -1;
14256 }
14257 arg->width = arg->width*10 + (arg->ch - '0');
14258 }
14259 }
14260
14261 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014262 if (arg->ch == '.') {
14263 arg->prec = 0;
14264 if (--ctx->fmtcnt >= 0) {
14265 arg->ch = FORMAT_READ(ctx);
14266 ctx->fmtpos++;
14267 }
14268 if (arg->ch == '*') {
14269 v = unicode_format_getnextarg(ctx);
14270 if (v == NULL)
14271 return -1;
14272 if (!PyLong_Check(v)) {
14273 PyErr_SetString(PyExc_TypeError,
14274 "* wants int");
14275 return -1;
14276 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014277 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014278 if (arg->prec == -1 && PyErr_Occurred())
14279 return -1;
14280 if (arg->prec < 0)
14281 arg->prec = 0;
14282 if (--ctx->fmtcnt >= 0) {
14283 arg->ch = FORMAT_READ(ctx);
14284 ctx->fmtpos++;
14285 }
14286 }
14287 else if (arg->ch >= '0' && arg->ch <= '9') {
14288 arg->prec = arg->ch - '0';
14289 while (--ctx->fmtcnt >= 0) {
14290 arg->ch = FORMAT_READ(ctx);
14291 ctx->fmtpos++;
14292 if (arg->ch < '0' || arg->ch > '9')
14293 break;
14294 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14295 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014296 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014297 return -1;
14298 }
14299 arg->prec = arg->prec*10 + (arg->ch - '0');
14300 }
14301 }
14302 }
14303
14304 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14305 if (ctx->fmtcnt >= 0) {
14306 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14307 if (--ctx->fmtcnt >= 0) {
14308 arg->ch = FORMAT_READ(ctx);
14309 ctx->fmtpos++;
14310 }
14311 }
14312 }
14313 if (ctx->fmtcnt < 0) {
14314 PyErr_SetString(PyExc_ValueError,
14315 "incomplete format");
14316 return -1;
14317 }
14318 return 0;
14319
14320#undef FORMAT_READ
14321}
14322
14323/* Format one argument. Supported conversion specifiers:
14324
14325 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014326 - "i", "d", "u": int or float
14327 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014328 - "e", "E", "f", "F", "g", "G": float
14329 - "c": int or str (1 character)
14330
Victor Stinner8dbd4212012-12-04 09:30:24 +010014331 When possible, the output is written directly into the Unicode writer
14332 (ctx->writer). A string is created when padding is required.
14333
Victor Stinnera47082312012-10-04 02:19:54 +020014334 Return 0 if the argument has been formatted into *p_str,
14335 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014336 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014337static int
14338unicode_format_arg_format(struct unicode_formatter_t *ctx,
14339 struct unicode_format_arg_t *arg,
14340 PyObject **p_str)
14341{
14342 PyObject *v;
14343 _PyUnicodeWriter *writer = &ctx->writer;
14344
14345 if (ctx->fmtcnt == 0)
14346 ctx->writer.overallocate = 0;
14347
14348 if (arg->ch == '%') {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014349 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014350 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014351 return 1;
14352 }
14353
14354 v = unicode_format_getnextarg(ctx);
14355 if (v == NULL)
14356 return -1;
14357
Victor Stinnera47082312012-10-04 02:19:54 +020014358
14359 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014360 case 's':
14361 case 'r':
14362 case 'a':
14363 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14364 /* Fast path */
14365 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14366 return -1;
14367 return 1;
14368 }
14369
14370 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14371 *p_str = v;
14372 Py_INCREF(*p_str);
14373 }
14374 else {
14375 if (arg->ch == 's')
14376 *p_str = PyObject_Str(v);
14377 else if (arg->ch == 'r')
14378 *p_str = PyObject_Repr(v);
14379 else
14380 *p_str = PyObject_ASCII(v);
14381 }
14382 break;
14383
14384 case 'i':
14385 case 'd':
14386 case 'u':
14387 case 'o':
14388 case 'x':
14389 case 'X':
14390 {
14391 int ret = mainformatlong(v, arg, p_str, writer);
14392 if (ret != 0)
14393 return ret;
14394 arg->sign = 1;
14395 break;
14396 }
14397
14398 case 'e':
14399 case 'E':
14400 case 'f':
14401 case 'F':
14402 case 'g':
14403 case 'G':
14404 if (arg->width == -1 && arg->prec == -1
14405 && !(arg->flags & (F_SIGN | F_BLANK)))
14406 {
14407 /* Fast path */
14408 if (formatfloat(v, arg, NULL, writer) == -1)
14409 return -1;
14410 return 1;
14411 }
14412
14413 arg->sign = 1;
14414 if (formatfloat(v, arg, p_str, NULL) == -1)
14415 return -1;
14416 break;
14417
14418 case 'c':
14419 {
14420 Py_UCS4 ch = formatchar(v);
14421 if (ch == (Py_UCS4) -1)
14422 return -1;
14423 if (arg->width == -1 && arg->prec == -1) {
14424 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014425 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014426 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014427 return 1;
14428 }
14429 *p_str = PyUnicode_FromOrdinal(ch);
14430 break;
14431 }
14432
14433 default:
14434 PyErr_Format(PyExc_ValueError,
14435 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014436 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014437 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14438 (int)arg->ch,
14439 ctx->fmtpos - 1);
14440 return -1;
14441 }
14442 if (*p_str == NULL)
14443 return -1;
14444 assert (PyUnicode_Check(*p_str));
14445 return 0;
14446}
14447
14448static int
14449unicode_format_arg_output(struct unicode_formatter_t *ctx,
14450 struct unicode_format_arg_t *arg,
14451 PyObject *str)
14452{
14453 Py_ssize_t len;
14454 enum PyUnicode_Kind kind;
14455 void *pbuf;
14456 Py_ssize_t pindex;
14457 Py_UCS4 signchar;
14458 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014459 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014460 Py_ssize_t sublen;
14461 _PyUnicodeWriter *writer = &ctx->writer;
14462 Py_UCS4 fill;
14463
14464 fill = ' ';
14465 if (arg->sign && arg->flags & F_ZERO)
14466 fill = '0';
14467
14468 if (PyUnicode_READY(str) == -1)
14469 return -1;
14470
14471 len = PyUnicode_GET_LENGTH(str);
14472 if ((arg->width == -1 || arg->width <= len)
14473 && (arg->prec == -1 || arg->prec >= len)
14474 && !(arg->flags & (F_SIGN | F_BLANK)))
14475 {
14476 /* Fast path */
14477 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14478 return -1;
14479 return 0;
14480 }
14481
14482 /* Truncate the string for "s", "r" and "a" formats
14483 if the precision is set */
14484 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14485 if (arg->prec >= 0 && len > arg->prec)
14486 len = arg->prec;
14487 }
14488
14489 /* Adjust sign and width */
14490 kind = PyUnicode_KIND(str);
14491 pbuf = PyUnicode_DATA(str);
14492 pindex = 0;
14493 signchar = '\0';
14494 if (arg->sign) {
14495 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14496 if (ch == '-' || ch == '+') {
14497 signchar = ch;
14498 len--;
14499 pindex++;
14500 }
14501 else if (arg->flags & F_SIGN)
14502 signchar = '+';
14503 else if (arg->flags & F_BLANK)
14504 signchar = ' ';
14505 else
14506 arg->sign = 0;
14507 }
14508 if (arg->width < len)
14509 arg->width = len;
14510
14511 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014512 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014513 if (!(arg->flags & F_LJUST)) {
14514 if (arg->sign) {
14515 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014516 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014517 }
14518 else {
14519 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014520 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014521 }
14522 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014523 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14524 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014525 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014526 }
14527
Victor Stinnera47082312012-10-04 02:19:54 +020014528 buflen = arg->width;
14529 if (arg->sign && len == arg->width)
14530 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014531 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014532 return -1;
14533
14534 /* Write the sign if needed */
14535 if (arg->sign) {
14536 if (fill != ' ') {
14537 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14538 writer->pos += 1;
14539 }
14540 if (arg->width > len)
14541 arg->width--;
14542 }
14543
14544 /* Write the numeric prefix for "x", "X" and "o" formats
14545 if the alternate form is used.
14546 For example, write "0x" for the "%#x" format. */
14547 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14548 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14549 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14550 if (fill != ' ') {
14551 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14552 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14553 writer->pos += 2;
14554 pindex += 2;
14555 }
14556 arg->width -= 2;
14557 if (arg->width < 0)
14558 arg->width = 0;
14559 len -= 2;
14560 }
14561
14562 /* Pad left with the fill character if needed */
14563 if (arg->width > len && !(arg->flags & F_LJUST)) {
14564 sublen = arg->width - len;
14565 FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14566 writer->pos += sublen;
14567 arg->width = len;
14568 }
14569
14570 /* If padding with spaces: write sign if needed and/or numeric prefix if
14571 the alternate form is used */
14572 if (fill == ' ') {
14573 if (arg->sign) {
14574 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14575 writer->pos += 1;
14576 }
14577 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14578 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14579 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14580 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14581 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14582 writer->pos += 2;
14583 pindex += 2;
14584 }
14585 }
14586
14587 /* Write characters */
14588 if (len) {
14589 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14590 str, pindex, len);
14591 writer->pos += len;
14592 }
14593
14594 /* Pad right with the fill character if needed */
14595 if (arg->width > len) {
14596 sublen = arg->width - len;
14597 FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14598 writer->pos += sublen;
14599 }
14600 return 0;
14601}
14602
14603/* Helper of PyUnicode_Format(): format one arg.
14604 Return 0 on success, raise an exception and return -1 on error. */
14605static int
14606unicode_format_arg(struct unicode_formatter_t *ctx)
14607{
14608 struct unicode_format_arg_t arg;
14609 PyObject *str;
14610 int ret;
14611
Victor Stinner8dbd4212012-12-04 09:30:24 +010014612 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14613 arg.flags = 0;
14614 arg.width = -1;
14615 arg.prec = -1;
14616 arg.sign = 0;
14617 str = NULL;
14618
Victor Stinnera47082312012-10-04 02:19:54 +020014619 ret = unicode_format_arg_parse(ctx, &arg);
14620 if (ret == -1)
14621 return -1;
14622
14623 ret = unicode_format_arg_format(ctx, &arg, &str);
14624 if (ret == -1)
14625 return -1;
14626
14627 if (ret != 1) {
14628 ret = unicode_format_arg_output(ctx, &arg, str);
14629 Py_DECREF(str);
14630 if (ret == -1)
14631 return -1;
14632 }
14633
14634 if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14635 PyErr_SetString(PyExc_TypeError,
14636 "not all arguments converted during string formatting");
14637 return -1;
14638 }
14639 return 0;
14640}
14641
Alexander Belopolsky40018472011-02-26 01:02:56 +000014642PyObject *
14643PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014644{
Victor Stinnera47082312012-10-04 02:19:54 +020014645 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014646
Guido van Rossumd57fd912000-03-10 22:53:23 +000014647 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014648 PyErr_BadInternalCall();
14649 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014650 }
Victor Stinnera47082312012-10-04 02:19:54 +020014651
14652 ctx.fmtstr = PyUnicode_FromObject(format);
14653 if (ctx.fmtstr == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +000014654 return NULL;
Victor Stinnera47082312012-10-04 02:19:54 +020014655 if (PyUnicode_READY(ctx.fmtstr) == -1) {
14656 Py_DECREF(ctx.fmtstr);
14657 return NULL;
14658 }
14659 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14660 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14661 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14662 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014663
Victor Stinner8f674cc2013-04-17 23:02:17 +020014664 _PyUnicodeWriter_Init(&ctx.writer);
14665 ctx.writer.min_length = ctx.fmtcnt + 100;
14666 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014667
Guido van Rossumd57fd912000-03-10 22:53:23 +000014668 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014669 ctx.arglen = PyTuple_Size(args);
14670 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014671 }
14672 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014673 ctx.arglen = -1;
14674 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014675 }
Victor Stinnera47082312012-10-04 02:19:54 +020014676 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014677 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014678 ctx.dict = args;
14679 else
14680 ctx.dict = NULL;
14681 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014682
Victor Stinnera47082312012-10-04 02:19:54 +020014683 while (--ctx.fmtcnt >= 0) {
14684 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014685 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014686
14687 nonfmtpos = ctx.fmtpos++;
14688 while (ctx.fmtcnt >= 0 &&
14689 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14690 ctx.fmtpos++;
14691 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014692 }
Victor Stinnera47082312012-10-04 02:19:54 +020014693 if (ctx.fmtcnt < 0) {
14694 ctx.fmtpos--;
14695 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014696 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014697
Victor Stinnercfc4c132013-04-03 01:48:39 +020014698 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14699 nonfmtpos, ctx.fmtpos) < 0)
14700 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014701 }
14702 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014703 ctx.fmtpos++;
14704 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014705 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014706 }
14707 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014708
Victor Stinnera47082312012-10-04 02:19:54 +020014709 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014710 PyErr_SetString(PyExc_TypeError,
14711 "not all arguments converted during string formatting");
14712 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014713 }
14714
Victor Stinnera47082312012-10-04 02:19:54 +020014715 if (ctx.args_owned) {
14716 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014717 }
Victor Stinnera47082312012-10-04 02:19:54 +020014718 Py_DECREF(ctx.fmtstr);
14719 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014720
Benjamin Peterson29060642009-01-31 22:14:21 +000014721 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020014722 Py_DECREF(ctx.fmtstr);
14723 _PyUnicodeWriter_Dealloc(&ctx.writer);
14724 if (ctx.args_owned) {
14725 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014726 }
14727 return NULL;
14728}
14729
Jeremy Hylton938ace62002-07-17 16:30:39 +000014730static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000014731unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14732
Tim Peters6d6c1a32001-08-02 04:15:00 +000014733static PyObject *
14734unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14735{
Benjamin Peterson29060642009-01-31 22:14:21 +000014736 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014737 static char *kwlist[] = {"object", "encoding", "errors", 0};
14738 char *encoding = NULL;
14739 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000014740
Benjamin Peterson14339b62009-01-31 16:36:08 +000014741 if (type != &PyUnicode_Type)
14742 return unicode_subtype_new(type, args, kwds);
14743 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000014744 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000014745 return NULL;
14746 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020014747 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000014748 if (encoding == NULL && errors == NULL)
14749 return PyObject_Str(x);
14750 else
Benjamin Peterson29060642009-01-31 22:14:21 +000014751 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000014752}
14753
Guido van Rossume023fe02001-08-30 03:12:59 +000014754static PyObject *
14755unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14756{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014757 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014758 Py_ssize_t length, char_size;
14759 int share_wstr, share_utf8;
14760 unsigned int kind;
14761 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000014762
Benjamin Peterson14339b62009-01-31 16:36:08 +000014763 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014764
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014765 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014766 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014767 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020014768 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050014769 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060014770 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014771 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060014772 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014773
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014774 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014775 if (self == NULL) {
14776 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000014777 return NULL;
14778 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014779 kind = PyUnicode_KIND(unicode);
14780 length = PyUnicode_GET_LENGTH(unicode);
14781
14782 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014783#ifdef Py_DEBUG
14784 _PyUnicode_HASH(self) = -1;
14785#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014786 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014787#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014788 _PyUnicode_STATE(self).interned = 0;
14789 _PyUnicode_STATE(self).kind = kind;
14790 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020014791 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014792 _PyUnicode_STATE(self).ready = 1;
14793 _PyUnicode_WSTR(self) = NULL;
14794 _PyUnicode_UTF8_LENGTH(self) = 0;
14795 _PyUnicode_UTF8(self) = NULL;
14796 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020014797 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014798
14799 share_utf8 = 0;
14800 share_wstr = 0;
14801 if (kind == PyUnicode_1BYTE_KIND) {
14802 char_size = 1;
14803 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14804 share_utf8 = 1;
14805 }
14806 else if (kind == PyUnicode_2BYTE_KIND) {
14807 char_size = 2;
14808 if (sizeof(wchar_t) == 2)
14809 share_wstr = 1;
14810 }
14811 else {
14812 assert(kind == PyUnicode_4BYTE_KIND);
14813 char_size = 4;
14814 if (sizeof(wchar_t) == 4)
14815 share_wstr = 1;
14816 }
14817
14818 /* Ensure we won't overflow the length. */
14819 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14820 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014821 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014822 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014823 data = PyObject_MALLOC((length + 1) * char_size);
14824 if (data == NULL) {
14825 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014826 goto onError;
14827 }
14828
Victor Stinnerc3c74152011-10-02 20:39:55 +020014829 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014830 if (share_utf8) {
14831 _PyUnicode_UTF8_LENGTH(self) = length;
14832 _PyUnicode_UTF8(self) = data;
14833 }
14834 if (share_wstr) {
14835 _PyUnicode_WSTR_LENGTH(self) = length;
14836 _PyUnicode_WSTR(self) = (wchar_t *)data;
14837 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014838
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014839 Py_MEMCPY(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020014840 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014841 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020014842#ifdef Py_DEBUG
14843 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14844#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020014845 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010014846 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020014847
14848onError:
14849 Py_DECREF(unicode);
14850 Py_DECREF(self);
14851 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000014852}
14853
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000014854PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070014855"str(object='') -> str\n\
14856str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000014857\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100014858Create a new string object from the given object. If encoding or\n\
14859errors is specified, then the object must expose a data buffer\n\
14860that will be decoded using the given encoding and error handler.\n\
14861Otherwise, returns the result of object.__str__() (if defined)\n\
14862or repr(object).\n\
14863encoding defaults to sys.getdefaultencoding().\n\
14864errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000014865
Guido van Rossum50e9fb92006-08-17 05:42:55 +000014866static PyObject *unicode_iter(PyObject *seq);
14867
Guido van Rossumd57fd912000-03-10 22:53:23 +000014868PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000014869 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Benjamin Peterson14339b62009-01-31 16:36:08 +000014870 "str", /* tp_name */
14871 sizeof(PyUnicodeObject), /* tp_size */
14872 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014873 /* Slots */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014874 (destructor)unicode_dealloc, /* tp_dealloc */
14875 0, /* tp_print */
14876 0, /* tp_getattr */
14877 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000014878 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014879 unicode_repr, /* tp_repr */
14880 &unicode_as_number, /* tp_as_number */
14881 &unicode_as_sequence, /* tp_as_sequence */
14882 &unicode_as_mapping, /* tp_as_mapping */
14883 (hashfunc) unicode_hash, /* tp_hash*/
14884 0, /* tp_call*/
14885 (reprfunc) unicode_str, /* tp_str */
14886 PyObject_GenericGetAttr, /* tp_getattro */
14887 0, /* tp_setattro */
14888 0, /* tp_as_buffer */
14889 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Benjamin Peterson29060642009-01-31 22:14:21 +000014890 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
Benjamin Peterson14339b62009-01-31 16:36:08 +000014891 unicode_doc, /* tp_doc */
14892 0, /* tp_traverse */
14893 0, /* tp_clear */
14894 PyUnicode_RichCompare, /* tp_richcompare */
14895 0, /* tp_weaklistoffset */
14896 unicode_iter, /* tp_iter */
14897 0, /* tp_iternext */
14898 unicode_methods, /* tp_methods */
14899 0, /* tp_members */
14900 0, /* tp_getset */
14901 &PyBaseObject_Type, /* tp_base */
14902 0, /* tp_dict */
14903 0, /* tp_descr_get */
14904 0, /* tp_descr_set */
14905 0, /* tp_dictoffset */
14906 0, /* tp_init */
14907 0, /* tp_alloc */
14908 unicode_new, /* tp_new */
14909 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014910};
14911
14912/* Initialize the Unicode implementation */
14913
Victor Stinner3a50e702011-10-18 21:21:00 +020014914int _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014915{
Thomas Wouters477c8d52006-05-27 19:21:47 +000014916 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014917 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000014918 0x000A, /* LINE FEED */
14919 0x000D, /* CARRIAGE RETURN */
14920 0x001C, /* FILE SEPARATOR */
14921 0x001D, /* GROUP SEPARATOR */
14922 0x001E, /* RECORD SEPARATOR */
14923 0x0085, /* NEXT LINE */
14924 0x2028, /* LINE SEPARATOR */
14925 0x2029, /* PARAGRAPH SEPARATOR */
14926 };
14927
Fred Drakee4315f52000-05-09 19:53:39 +000014928 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020014929 _Py_INCREF_UNICODE_EMPTY();
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014930 if (!unicode_empty)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014931 Py_FatalError("Can't create empty string");
Serhiy Storchaka678db842013-01-26 12:16:36 +020014932 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014933
Guido van Rossumcacfc072002-05-24 19:01:59 +000014934 if (PyType_Ready(&PyUnicode_Type) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014935 Py_FatalError("Can't initialize 'unicode'");
Thomas Wouters477c8d52006-05-27 19:21:47 +000014936
14937 /* initialize the linebreak bloom filter */
14938 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014939 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020014940 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014941
Christian Heimes26532f72013-07-20 14:57:16 +020014942 if (PyType_Ready(&EncodingMapType) < 0)
14943 Py_FatalError("Can't initialize encoding map type");
Victor Stinner3a50e702011-10-18 21:21:00 +020014944
Benjamin Petersonc4311282012-10-30 23:21:10 -040014945 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14946 Py_FatalError("Can't initialize field name iterator type");
14947
14948 if (PyType_Ready(&PyFormatterIter_Type) < 0)
14949 Py_FatalError("Can't initialize formatter iter type");
Benjamin Petersone8ea97f2012-10-30 23:27:52 -040014950
Victor Stinner3a50e702011-10-18 21:21:00 +020014951 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014952}
14953
/* Finalize the Unicode implementation */

/* Free-list clearing entry point.  This implementation does nothing and
   always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
14961
Guido van Rossumd57fd912000-03-10 22:53:23 +000014962void
Thomas Wouters78890102000-07-22 19:25:51 +000014963_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014964{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000014965 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014966
Serhiy Storchaka05997252013-01-26 12:14:02 +020014967 Py_CLEAR(unicode_empty);
Barry Warsaw5b4c2282000-10-03 20:45:26 +000014968
Serhiy Storchaka05997252013-01-26 12:14:02 +020014969 for (i = 0; i < 256; i++)
14970 Py_CLEAR(unicode_latin1[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +020014971 _PyUnicode_ClearStaticStrings();
Christian Heimesa156e092008-02-16 07:38:31 +000014972 (void)PyUnicode_ClearFreeList();
Guido van Rossumd57fd912000-03-10 22:53:23 +000014973}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000014974
Walter Dörwald16807132007-05-25 13:52:07 +000014975void
14976PyUnicode_InternInPlace(PyObject **p)
14977{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020014978 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014979 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020014980#ifdef Py_DEBUG
14981 assert(s != NULL);
14982 assert(_PyUnicode_CHECK(s));
14983#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000014984 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020014985 return;
14986#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000014987 /* If it's a subclass, we don't really know what putting
14988 it in the interned dict might do. */
14989 if (!PyUnicode_CheckExact(s))
14990 return;
14991 if (PyUnicode_CHECK_INTERNED(s))
14992 return;
14993 if (interned == NULL) {
14994 interned = PyDict_New();
14995 if (interned == NULL) {
14996 PyErr_Clear(); /* Don't leave an exception */
14997 return;
14998 }
14999 }
15000 /* It might be that the GetItem call fails even
15001 though the key is present in the dictionary,
15002 namely when this happens during a stack overflow. */
15003 Py_ALLOW_RECURSION
Victor Stinner7931d9a2011-11-04 00:22:48 +010015004 t = PyDict_GetItem(interned, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015005 Py_END_ALLOW_RECURSION
Martin v. Löwis5b222132007-06-10 09:51:05 +000015006
Victor Stinnerf0335102013-04-14 19:13:03 +020015007 if (t) {
15008 Py_INCREF(t);
Serhiy Storchaka5a57ade2015-12-24 10:35:59 +020015009 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015010 return;
15011 }
Walter Dörwald16807132007-05-25 13:52:07 +000015012
Benjamin Peterson14339b62009-01-31 16:36:08 +000015013 PyThreadState_GET()->recursion_critical = 1;
Victor Stinner7931d9a2011-11-04 00:22:48 +010015014 if (PyDict_SetItem(interned, s, s) < 0) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015015 PyErr_Clear();
15016 PyThreadState_GET()->recursion_critical = 0;
15017 return;
15018 }
15019 PyThreadState_GET()->recursion_critical = 0;
15020 /* The two references in interned are not counted by refcnt.
15021 The deallocator will take care of this */
15022 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015023 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015024}
15025
15026void
15027PyUnicode_InternImmortal(PyObject **p)
15028{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015029 PyUnicode_InternInPlace(p);
15030 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015031 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015032 Py_INCREF(*p);
15033 }
Walter Dörwald16807132007-05-25 13:52:07 +000015034}
15035
15036PyObject *
15037PyUnicode_InternFromString(const char *cp)
15038{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015039 PyObject *s = PyUnicode_FromString(cp);
15040 if (s == NULL)
15041 return NULL;
15042 PyUnicode_InternInPlace(&s);
15043 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015044}
15045
Alexander Belopolsky40018472011-02-26 01:02:56 +000015046void
15047_Py_ReleaseInternedUnicodeStrings(void)
Walter Dörwald16807132007-05-25 13:52:07 +000015048{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015049 PyObject *keys;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015050 PyObject *s;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015051 Py_ssize_t i, n;
15052 Py_ssize_t immortal_size = 0, mortal_size = 0;
Walter Dörwald16807132007-05-25 13:52:07 +000015053
Benjamin Peterson14339b62009-01-31 16:36:08 +000015054 if (interned == NULL || !PyDict_Check(interned))
15055 return;
15056 keys = PyDict_Keys(interned);
15057 if (keys == NULL || !PyList_Check(keys)) {
15058 PyErr_Clear();
15059 return;
15060 }
Walter Dörwald16807132007-05-25 13:52:07 +000015061
Benjamin Peterson14339b62009-01-31 16:36:08 +000015062 /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15063 detector, interned unicode strings are not forcibly deallocated;
15064 rather, we give them their stolen references back, and then clear
15065 and DECREF the interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015066
Benjamin Peterson14339b62009-01-31 16:36:08 +000015067 n = PyList_GET_SIZE(keys);
15068 fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
Benjamin Peterson29060642009-01-31 22:14:21 +000015069 n);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015070 for (i = 0; i < n; i++) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015071 s = PyList_GET_ITEM(keys, i);
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015072 if (PyUnicode_READY(s) == -1) {
15073 assert(0 && "could not ready string");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015074 fprintf(stderr, "could not ready string\n");
Victor Stinner6b56a7f2011-10-04 20:04:52 +020015075 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015076 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015077 case SSTATE_NOT_INTERNED:
15078 /* XXX Shouldn't happen */
15079 break;
15080 case SSTATE_INTERNED_IMMORTAL:
15081 Py_REFCNT(s) += 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015082 immortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015083 break;
15084 case SSTATE_INTERNED_MORTAL:
15085 Py_REFCNT(s) += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015086 mortal_size += PyUnicode_GET_LENGTH(s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015087 break;
15088 default:
15089 Py_FatalError("Inconsistent interned string state.");
15090 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015091 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015092 }
15093 fprintf(stderr, "total size of all interned strings: "
15094 "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15095 "mortal/immortal\n", mortal_size, immortal_size);
15096 Py_DECREF(keys);
15097 PyDict_Clear(interned);
Serhiy Storchaka05997252013-01-26 12:14:02 +020015098 Py_CLEAR(interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015099}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015100
15101
15102/********************* Unicode Iterator **************************/
15103
15104typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015105 PyObject_HEAD
15106 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015107 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015108} unicodeiterobject;
15109
15110static void
15111unicodeiter_dealloc(unicodeiterobject *it)
15112{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015113 _PyObject_GC_UNTRACK(it);
15114 Py_XDECREF(it->it_seq);
15115 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015116}
15117
15118static int
15119unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15120{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015121 Py_VISIT(it->it_seq);
15122 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015123}
15124
15125static PyObject *
15126unicodeiter_next(unicodeiterobject *it)
15127{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015128 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015129
Benjamin Peterson14339b62009-01-31 16:36:08 +000015130 assert(it != NULL);
15131 seq = it->it_seq;
15132 if (seq == NULL)
15133 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015134 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015135
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015136 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15137 int kind = PyUnicode_KIND(seq);
15138 void *data = PyUnicode_DATA(seq);
15139 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15140 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015141 if (item != NULL)
15142 ++it->it_index;
15143 return item;
15144 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015145
Benjamin Peterson14339b62009-01-31 16:36:08 +000015146 Py_DECREF(seq);
15147 it->it_seq = NULL;
15148 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015149}
15150
15151static PyObject *
15152unicodeiter_len(unicodeiterobject *it)
15153{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015154 Py_ssize_t len = 0;
15155 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015156 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015157 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015158}
15159
15160PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15161
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015162static PyObject *
15163unicodeiter_reduce(unicodeiterobject *it)
15164{
15165 if (it->it_seq != NULL) {
Antoine Pitroua7013882012-04-05 00:04:20 +020015166 return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015167 it->it_seq, it->it_index);
15168 } else {
15169 PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15170 if (u == NULL)
15171 return NULL;
Antoine Pitroua7013882012-04-05 00:04:20 +020015172 return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015173 }
15174}
15175
15176PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15177
15178static PyObject *
15179unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15180{
15181 Py_ssize_t index = PyLong_AsSsize_t(state);
15182 if (index == -1 && PyErr_Occurred())
15183 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015184 if (it->it_seq != NULL) {
15185 if (index < 0)
15186 index = 0;
15187 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15188 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15189 it->it_index = index;
15190 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015191 Py_RETURN_NONE;
15192}
15193
15194PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15195
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015196static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015197 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015198 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015199 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15200 reduce_doc},
15201 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15202 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015203 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015204};
15205
15206PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015207 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15208 "str_iterator", /* tp_name */
15209 sizeof(unicodeiterobject), /* tp_basicsize */
15210 0, /* tp_itemsize */
15211 /* methods */
15212 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15213 0, /* tp_print */
15214 0, /* tp_getattr */
15215 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015216 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015217 0, /* tp_repr */
15218 0, /* tp_as_number */
15219 0, /* tp_as_sequence */
15220 0, /* tp_as_mapping */
15221 0, /* tp_hash */
15222 0, /* tp_call */
15223 0, /* tp_str */
15224 PyObject_GenericGetAttr, /* tp_getattro */
15225 0, /* tp_setattro */
15226 0, /* tp_as_buffer */
15227 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15228 0, /* tp_doc */
15229 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15230 0, /* tp_clear */
15231 0, /* tp_richcompare */
15232 0, /* tp_weaklistoffset */
15233 PyObject_SelfIter, /* tp_iter */
15234 (iternextfunc)unicodeiter_next, /* tp_iternext */
15235 unicodeiter_methods, /* tp_methods */
15236 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015237};
15238
15239static PyObject *
15240unicode_iter(PyObject *seq)
15241{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015242 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015243
Benjamin Peterson14339b62009-01-31 16:36:08 +000015244 if (!PyUnicode_Check(seq)) {
15245 PyErr_BadInternalCall();
15246 return NULL;
15247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015248 if (PyUnicode_READY(seq) == -1)
15249 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015250 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15251 if (it == NULL)
15252 return NULL;
15253 it->it_index = 0;
15254 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015255 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015256 _PyObject_GC_TRACK(it);
15257 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015258}
15259
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015260
/* Return the number of Py_UNICODE units in the null-terminated string `u`,
   not counting the terminating null.

   The counter has the same type as the return value: an `int` counter would
   overflow (undefined behaviour) for strings longer than INT_MAX units. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t res = 0;
    while (*u++)
        res++;
    return res;
}
15269
/* Copy the null-terminated string `s2`, terminator included, into `s1`.
   The caller must provide enough room in `s1`.  Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;
    Py_UNICODE ch;

    do {
        ch = *s2++;
        *dst++ = ch;
    } while (ch != 0);
    return s1;
}
15277
/* Copy at most `n` Py_UNICODE units from the null-terminated string `s2`
   into `s1` and return `s1`.

   Copying stops after the terminating null has been copied or after `n`
   units have been written, whichever comes first.  As with strncpy(), the
   result is not null-terminated when `s2` holds `n` or more units before
   its terminator.  Unlike strncpy(), the rest of `s1` is not padded with
   nulls.

   The count is tested *before* each unit is stored, so no more than `n`
   units are ever written; with n == 0 nothing is written at all. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;
    while (n != 0) {
        n--;
        if ((*u++ = *s2++) == 0)
            break;
    }
    return s1;
}
15287
/* Append the null-terminated string `s2`, terminator included, to the end
   of the null-terminated string `s1`.  The caller must provide enough room
   in `s1`.  Returns `s1`. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *tail = s1;

    /* Locate the terminator of s1 ... */
    while (*tail != 0) {
        tail++;
    }
    /* ... and overwrite from there with s2, up to and including its null. */
    while ((*tail++ = *s2++) != 0) {
        /* copying */
    }
    return s1;
}
15296
/* Compare two null-terminated Py_UNICODE strings unit by unit.

   Returns 0 if they are equal, -1 if `s1` sorts before `s2` and +1 if it
   sorts after.  A string that is a proper prefix of the other sorts
   first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (;; s1++, s2++) {
        const Py_UNICODE c1 = *s1;
        const Py_UNICODE c2 = *s2;

        if (c1 == 0) {
            /* s1 ended: equal if s2 ended too, otherwise s1 is a prefix. */
            return (c2 == 0) ? 0 : -1;
        }
        if (c2 == 0) {
            /* s2 is a proper prefix of s1. */
            return 1;
        }
        if (c1 != c2) {
            return (c1 < c2) ? -1 : +1;
        }
    }
}
15310
/* Compare at most `n` units of two null-terminated Py_UNICODE strings.

   Returns -1 or +1 at the first differing unit, and 0 if no difference is
   found within `n` units or before the common terminating null. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE c1 = s1[i];
        const Py_UNICODE c2 = s2[i];

        if (c1 != c2) {
            return (c1 < c2) ? -1 : +1;
        }
        if (c1 == '\0') {
            /* Both strings ended here without a difference. */
            break;
        }
    }
    return 0;
}
15327
/* Return a pointer to the first occurrence of `c` in the null-terminated
   string `s`, or NULL if there is none.  The terminating null is never
   examined, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *cur = s;

    while (*cur != 0) {
        if (*cur == c) {
            return (Py_UNICODE*)cur;
        }
        cur++;
    }
    return NULL;
}
15337
/* Return a pointer to the last occurrence of `c` in the null-terminated
   string `s`, or NULL if there is none.  The terminating null is never
   examined, so searching for 0 yields NULL. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    const Py_UNICODE *cur;
    const Py_UNICODE *last = NULL;

    /* One forward pass, remembering the most recent match. */
    for (cur = s; *cur != 0; cur++) {
        if (*cur == c) {
            last = cur;
        }
    }
    return (Py_UNICODE*)last;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015350
Victor Stinner71133ff2010-09-01 23:43:53 +000015351Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015352PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015353{
Victor Stinner577db2c2011-10-11 22:12:48 +020015354 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015355 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015356
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015357 if (!PyUnicode_Check(unicode)) {
15358 PyErr_BadArgument();
15359 return NULL;
15360 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015361 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015362 if (u == NULL)
15363 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015364 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015365 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015366 PyErr_NoMemory();
15367 return NULL;
15368 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015369 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015370 size *= sizeof(Py_UNICODE);
15371 copy = PyMem_Malloc(size);
15372 if (copy == NULL) {
15373 PyErr_NoMemory();
15374 return NULL;
15375 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015376 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015377 return copy;
15378}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015379
Georg Brandl66c221e2010-10-14 07:04:07 +000015380/* A _string module, to export formatter_parser and formatter_field_name_split
15381 to the string.Formatter class implemented in Python. */
15382
15383static PyMethodDef _string_methods[] = {
15384 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15385 METH_O, PyDoc_STR("split the argument as a field name")},
15386 {"formatter_parser", (PyCFunction) formatter_parser,
15387 METH_O, PyDoc_STR("parse the argument as a format string")},
15388 {NULL, NULL}
15389};
15390
15391static struct PyModuleDef _string_module = {
15392 PyModuleDef_HEAD_INIT,
15393 "_string",
15394 PyDoc_STR("string helper module"),
15395 0,
15396 _string_methods,
15397 NULL,
15398 NULL,
15399 NULL,
15400 NULL
15401};
15402
15403PyMODINIT_FUNC
15404PyInit__string(void)
15405{
15406 return PyModule_Create(&_string_module);
15407}
15408
15409
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015410#ifdef __cplusplus
15411}
15412#endif