blob: cb1456ea847acfd813d3c5f37c46e9692789fe43 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner331a6a52019-05-27 16:39:22 +020043#include "pycore_initconfig.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010044#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040046#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010047#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050049#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070050#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000051
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000052#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000053#include <windows.h>
54#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000055
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
58/* #define INTERNED_STATS 1 */
59
60
Larry Hastings61272b72014-01-07 12:41:53 -080061/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090062class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080063[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090064/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65
66/*[python input]
67class Py_UCS4_converter(CConverter):
68 type = 'Py_UCS4'
69 converter = 'convert_uc'
70
71 def converter_init(self):
72 if self.default is not unspecified:
73 self.c_default = ascii(self.default)
74 if len(self.c_default) > 4 or self.c_default[0] != "'":
75 self.c_default = hex(ord(self.default))
76
77[python start generated code]*/
78/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080079
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
Serhiy Storchaka05997252013-01-26 12:14:02 +020082NOTE: In the interpreter's initialization phase, some globals are currently
83 initialized dynamically as needed. In the process Unicode objects may
84 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Type check used inside the assertions of this file.  A regular build
   only calls PyUnicode_Check(); a Py_DEBUG build runs
   _PyUnicode_CheckConsistency() with check_content=0 instead. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* Accessors for the fields of the string structs (PyASCIIObject,
   PyCompactUnicodeObject, PyUnicodeObject).

   The macros without an assert expand to the bare struct member and do no
   checking at all.  PyUnicode_UTF8() and PyUnicode_UTF8_LENGTH() assert
   that the object is a ready string and special-case compact ASCII
   objects: for those the UTF-8 data is the memory directly following the
   PyASCIIObject header and the UTF-8 length is the string length. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

#define _PyUnicode_WSTR(op)         (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)

/* Checked variants: assert the type first, then read the field. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)

/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready().
   The type assertion goes through _PyUnicode_CHECK(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* True if the UTF-8 pointer of the object is the same pointer as its
   character data, i.e. no separate UTF-8 block is attached.  Must not be
   used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the object is the same pointer as its
   character data.

   The macro now consistently uses its own parameter.  It used to expand
   _PyUnicode_WSTR(unicode), which silently picked up whatever variable
   named "unicode" existed at the call site (or failed to compile when
   there was none). */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
151
/* True if the Unicode object owns a UTF-8 memory block of its own: it is
   not compact ASCII, the utf8 pointer is set, and that pointer is not
   simply the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (!PyUnicode_IS_COMPACT_ASCII(op) \
     && _PyUnicode_UTF8(op) \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object owns a wstr memory block of its own: the
   wstr pointer is set and either the string is not ready yet or wstr is
   not simply the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (_PyUnicode_WSTR(op) \
     && (!PyUnicode_IS_READY(op) \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
165
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The source is only read, so it is accessed through pointers to const;
   the casts keep that qualifier instead of dropping it.  All locals carry
   a leading underscore so that they cannot shadow caller variables such
   as "n".  The main loop is unrolled four elements at a time; the
   trailing loop copies the remaining 0-3 elements. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end);    \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200189
/* Overallocation factor, chosen per platform. */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
197
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200206static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000207
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000208/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200210
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211#define _Py_INCREF_UNICODE_EMPTY() \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212 do { \
213 if (unicode_empty != NULL) \
214 Py_INCREF(unicode_empty); \
215 else { \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200216 unicode_empty = PyUnicode_New(0, 0); \
217 if (unicode_empty != NULL) { \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200218 Py_INCREF(unicode_empty); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200219 assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
220 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200221 } \
Serhiy Storchaka05997252013-01-26 12:14:02 +0200222 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000223
Serhiy Storchaka678db842013-01-26 12:16:36 +0200224#define _Py_RETURN_UNICODE_EMPTY() \
225 do { \
226 _Py_INCREF_UNICODE_EMPTY(); \
227 return unicode_empty; \
228 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000229
Victor Stinner59423e32018-11-26 13:40:01 +0100230static inline void
231unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
232 Py_ssize_t start, Py_ssize_t length)
233{
234 assert(0 <= start);
235 assert(kind != PyUnicode_WCHAR_KIND);
236 switch (kind) {
237 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS1 ch = (unsigned char)value;
240 Py_UCS1 *to = (Py_UCS1 *)data + start;
241 memset(to, ch, length);
242 break;
243 }
244 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100245 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100246 Py_UCS2 ch = (Py_UCS2)value;
247 Py_UCS2 *to = (Py_UCS2 *)data + start;
248 const Py_UCS2 *end = to + length;
249 for (; to < end; ++to) *to = ch;
250 break;
251 }
252 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100253 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100254 Py_UCS4 ch = value;
255 Py_UCS4 * to = (Py_UCS4 *)data + start;
256 const Py_UCS4 *end = to + length;
257 for (; to < end; ++to) *to = ch;
258 break;
259 }
260 default: Py_UNREACHABLE();
261 }
262}
263
264
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200265/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700266static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200267_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900268static inline void
269_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400270static PyObject *
271unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
272 const char *errors);
273static PyObject *
274unicode_decode_utf8(const char *s, Py_ssize_t size,
275 _Py_error_handler error_handler, const char *errors,
276 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200277
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200278/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200279static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200280
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000281/* Single character Unicode strings in the Latin-1 range are being
282 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200283static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000284
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by a 7-bit character code (128 entries); an entry
   is 1 for the following characters and 0 for everything else:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN        0x1C FILE SEPARATOR
     0x1D GROUP SEPARATOR        0x1E RECORD SEPARATOR
     0x1F UNIT SEPARATOR         0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
315
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200316/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200317static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200318static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100319static int unicode_modifiable(PyObject *unicode);
320
Victor Stinnerfe226c02011-10-03 03:52:20 +0200321
Alexander Belopolsky40018472011-02-26 01:02:56 +0000322static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100323_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200324static PyObject *
325_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
326static PyObject *
327_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
328
329static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000330unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000331 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100332 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000333 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
334
Alexander Belopolsky40018472011-02-26 01:02:56 +0000335static void
336raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300337 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100338 PyObject *unicode,
339 Py_ssize_t startpos, Py_ssize_t endpos,
340 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000341
/* Fast detection of line break characters.

   Lookup table indexed by a 7-bit character code (128 entries); an entry
   is 1 for the following characters and 0 for everything else:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
369
INADA Naoki3ae20562017-01-16 20:41:20 +0900370static int convert_uc(PyObject *obj, void *addr);
371
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300372#include "clinic/unicodeobject.c.h"
373
Victor Stinner3d4226a2018-08-29 22:21:32 +0200374_Py_error_handler
375_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200376{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200377 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200378 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200379 }
380 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200381 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200382 }
383 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200384 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200385 }
386 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200387 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200388 }
389 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200390 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200391 }
392 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200393 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200394 }
395 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200396 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200397 }
Victor Stinner50149202015-09-22 00:26:54 +0200398 return _Py_ERROR_OTHER;
399}
400
Victor Stinner709d23d2019-05-02 14:56:30 -0400401
402static _Py_error_handler
403get_error_handler_wide(const wchar_t *errors)
404{
405 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
406 return _Py_ERROR_STRICT;
407 }
408 if (wcscmp(errors, L"surrogateescape") == 0) {
409 return _Py_ERROR_SURROGATEESCAPE;
410 }
411 if (wcscmp(errors, L"replace") == 0) {
412 return _Py_ERROR_REPLACE;
413 }
414 if (wcscmp(errors, L"ignore") == 0) {
415 return _Py_ERROR_IGNORE;
416 }
417 if (wcscmp(errors, L"backslashreplace") == 0) {
418 return _Py_ERROR_BACKSLASHREPLACE;
419 }
420 if (wcscmp(errors, L"surrogatepass") == 0) {
421 return _Py_ERROR_SURROGATEPASS;
422 }
423 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
424 return _Py_ERROR_XMLCHARREFREPLACE;
425 }
426 return _Py_ERROR_OTHER;
427}
428
429
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300430/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
431 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000432Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000433PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000434{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000435#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000436 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000437#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000438 /* This is actually an illegal character, so it should
439 not be passed to unichr. */
440 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000441#endif
442}
443
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200444int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100445_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200446{
447 PyASCIIObject *ascii;
448 unsigned int kind;
449
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200450 _PyObject_ASSERT(op, PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200451
452 ascii = (PyASCIIObject *)op;
453 kind = ascii->state.kind;
454
Victor Stinnera3b334d2011-10-03 13:53:37 +0200455 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200456 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND);
457 _PyObject_ASSERT(op, ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200458 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200459 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200460 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200461 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200462
Victor Stinnera41463c2011-10-04 01:05:08 +0200463 if (ascii->state.compact == 1) {
464 data = compact + 1;
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200465 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
466 || kind == PyUnicode_2BYTE_KIND
467 || kind == PyUnicode_4BYTE_KIND);
468 _PyObject_ASSERT(op, ascii->state.ascii == 0);
469 _PyObject_ASSERT(op, ascii->state.ready == 1);
470 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100471 }
472 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200473 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
474
475 data = unicode->data.any;
476 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200477 _PyObject_ASSERT(op, ascii->length == 0);
478 _PyObject_ASSERT(op, ascii->hash == -1);
479 _PyObject_ASSERT(op, ascii->state.compact == 0);
480 _PyObject_ASSERT(op, ascii->state.ascii == 0);
481 _PyObject_ASSERT(op, ascii->state.ready == 0);
482 _PyObject_ASSERT(op, ascii->state.interned == SSTATE_NOT_INTERNED);
483 _PyObject_ASSERT(op, ascii->wstr != NULL);
484 _PyObject_ASSERT(op, data == NULL);
485 _PyObject_ASSERT(op, compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200486 }
487 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200488 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
489 || kind == PyUnicode_2BYTE_KIND
490 || kind == PyUnicode_4BYTE_KIND);
491 _PyObject_ASSERT(op, ascii->state.compact == 0);
492 _PyObject_ASSERT(op, ascii->state.ready == 1);
493 _PyObject_ASSERT(op, data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200494 if (ascii->state.ascii) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200495 _PyObject_ASSERT(op, compact->utf8 == data);
496 _PyObject_ASSERT(op, compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200497 }
498 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200499 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200500 }
501 }
502 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200503 if (
504#if SIZEOF_WCHAR_T == 2
505 kind == PyUnicode_2BYTE_KIND
506#else
507 kind == PyUnicode_4BYTE_KIND
508#endif
509 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200510 {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200511 _PyObject_ASSERT(op, ascii->wstr == data);
512 _PyObject_ASSERT(op, compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200513 } else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200514 _PyObject_ASSERT(op, ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200515 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200516
517 if (compact->utf8 == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200518 _PyObject_ASSERT(op, compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200519 if (ascii->wstr == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200520 _PyObject_ASSERT(op, compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200521 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200522
523 /* check that the best kind is used: O(n) operation */
524 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200525 Py_ssize_t i;
526 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200527 void *data;
528 Py_UCS4 ch;
529
530 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200531 for (i=0; i < ascii->length; i++)
532 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200533 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200534 if (ch > maxchar)
535 maxchar = ch;
536 }
537 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100538 if (ascii->state.ascii == 0) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200539 _PyObject_ASSERT(op, maxchar >= 128);
540 _PyObject_ASSERT(op, maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100541 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200542 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200543 _PyObject_ASSERT(op, maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200544 }
Victor Stinner77faf692011-11-20 18:56:05 +0100545 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200546 _PyObject_ASSERT(op, maxchar >= 0x100);
547 _PyObject_ASSERT(op, maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100548 }
549 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200550 _PyObject_ASSERT(op, maxchar >= 0x10000);
551 _PyObject_ASSERT(op, maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100552 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200553 _PyObject_ASSERT(op, PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200554 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400555 return 1;
556}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200557
Victor Stinner910337b2011-10-03 03:20:16 +0200558
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100559static PyObject*
560unicode_result_wchar(PyObject *unicode)
561{
562#ifndef Py_DEBUG
563 Py_ssize_t len;
564
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100565 len = _PyUnicode_WSTR_LENGTH(unicode);
566 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100567 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200568 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100569 }
570
571 if (len == 1) {
572 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100573 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100574 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
575 Py_DECREF(unicode);
576 return latin1_char;
577 }
578 }
579
580 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200581 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100582 return NULL;
583 }
584#else
Victor Stinneraa771272012-10-04 02:32:58 +0200585 assert(Py_REFCNT(unicode) == 1);
586
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100587 /* don't make the result ready in debug mode to ensure that the caller
588 makes the string ready before using it */
589 assert(_PyUnicode_CheckConsistency(unicode, 1));
590#endif
591 return unicode;
592}
593
594static PyObject*
595unicode_result_ready(PyObject *unicode)
596{
597 Py_ssize_t length;
598
599 length = PyUnicode_GET_LENGTH(unicode);
600 if (length == 0) {
601 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100602 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200603 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100604 }
605 return unicode_empty;
606 }
607
608 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200609 void *data = PyUnicode_DATA(unicode);
610 int kind = PyUnicode_KIND(unicode);
611 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100612 if (ch < 256) {
613 PyObject *latin1_char = unicode_latin1[ch];
614 if (latin1_char != NULL) {
615 if (unicode != latin1_char) {
616 Py_INCREF(latin1_char);
617 Py_DECREF(unicode);
618 }
619 return latin1_char;
620 }
621 else {
622 assert(_PyUnicode_CheckConsistency(unicode, 1));
623 Py_INCREF(unicode);
624 unicode_latin1[ch] = unicode;
625 return unicode;
626 }
627 }
628 }
629
630 assert(_PyUnicode_CheckConsistency(unicode, 1));
631 return unicode;
632}
633
634static PyObject*
635unicode_result(PyObject *unicode)
636{
637 assert(_PyUnicode_CHECK(unicode));
638 if (PyUnicode_IS_READY(unicode))
639 return unicode_result_ready(unicode);
640 else
641 return unicode_result_wchar(unicode);
642}
643
Victor Stinnerc4b49542011-12-11 22:44:26 +0100644static PyObject*
645unicode_result_unchanged(PyObject *unicode)
646{
647 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500648 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100649 return NULL;
650 Py_INCREF(unicode);
651 return unicode;
652 }
653 else
654 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100655 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100656}
657
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200658/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
659 ASCII, Latin1, UTF-8, etc. */
660static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200661backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200662 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
663{
Victor Stinnerad771582015-10-09 12:38:53 +0200664 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200665 Py_UCS4 ch;
666 enum PyUnicode_Kind kind;
667 void *data;
668
669 assert(PyUnicode_IS_READY(unicode));
670 kind = PyUnicode_KIND(unicode);
671 data = PyUnicode_DATA(unicode);
672
673 size = 0;
674 /* determine replacement size */
675 for (i = collstart; i < collend; ++i) {
676 Py_ssize_t incr;
677
678 ch = PyUnicode_READ(kind, data, i);
679 if (ch < 0x100)
680 incr = 2+2;
681 else if (ch < 0x10000)
682 incr = 2+4;
683 else {
684 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200685 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200686 }
687 if (size > PY_SSIZE_T_MAX - incr) {
688 PyErr_SetString(PyExc_OverflowError,
689 "encoded result is too long for a Python string");
690 return NULL;
691 }
692 size += incr;
693 }
694
Victor Stinnerad771582015-10-09 12:38:53 +0200695 str = _PyBytesWriter_Prepare(writer, str, size);
696 if (str == NULL)
697 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200698
699 /* generate replacement */
700 for (i = collstart; i < collend; ++i) {
701 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200702 *str++ = '\\';
703 if (ch >= 0x00010000) {
704 *str++ = 'U';
705 *str++ = Py_hexdigits[(ch>>28)&0xf];
706 *str++ = Py_hexdigits[(ch>>24)&0xf];
707 *str++ = Py_hexdigits[(ch>>20)&0xf];
708 *str++ = Py_hexdigits[(ch>>16)&0xf];
709 *str++ = Py_hexdigits[(ch>>12)&0xf];
710 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200711 }
Victor Stinner797485e2015-10-09 03:17:30 +0200712 else if (ch >= 0x100) {
713 *str++ = 'u';
714 *str++ = Py_hexdigits[(ch>>12)&0xf];
715 *str++ = Py_hexdigits[(ch>>8)&0xf];
716 }
717 else
718 *str++ = 'x';
719 *str++ = Py_hexdigits[(ch>>4)&0xf];
720 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200721 }
722 return str;
723}
724
725/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
726 ASCII, Latin1, UTF-8, etc. */
727static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200728xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200729 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
730{
Victor Stinnerad771582015-10-09 12:38:53 +0200731 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200732 Py_UCS4 ch;
733 enum PyUnicode_Kind kind;
734 void *data;
735
736 assert(PyUnicode_IS_READY(unicode));
737 kind = PyUnicode_KIND(unicode);
738 data = PyUnicode_DATA(unicode);
739
740 size = 0;
741 /* determine replacement size */
742 for (i = collstart; i < collend; ++i) {
743 Py_ssize_t incr;
744
745 ch = PyUnicode_READ(kind, data, i);
746 if (ch < 10)
747 incr = 2+1+1;
748 else if (ch < 100)
749 incr = 2+2+1;
750 else if (ch < 1000)
751 incr = 2+3+1;
752 else if (ch < 10000)
753 incr = 2+4+1;
754 else if (ch < 100000)
755 incr = 2+5+1;
756 else if (ch < 1000000)
757 incr = 2+6+1;
758 else {
759 assert(ch <= MAX_UNICODE);
760 incr = 2+7+1;
761 }
762 if (size > PY_SSIZE_T_MAX - incr) {
763 PyErr_SetString(PyExc_OverflowError,
764 "encoded result is too long for a Python string");
765 return NULL;
766 }
767 size += incr;
768 }
769
Victor Stinnerad771582015-10-09 12:38:53 +0200770 str = _PyBytesWriter_Prepare(writer, str, size);
771 if (str == NULL)
772 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200773
774 /* generate replacement */
775 for (i = collstart; i < collend; ++i) {
776 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
777 }
778 return str;
779}
780
Thomas Wouters477c8d52006-05-27 19:21:47 +0000781/* --- Bloom Filters ----------------------------------------------------- */
782
783/* stuff to implement simple "bloom filters" for Unicode characters.
784 to keep things simple, we use a single bitmask, using the least 5
785 bits from each unicode characters as the bit index. */
786
787/* the linebreak mask is set up by Unicode_Init below */
788
Antoine Pitrouf068f942010-01-13 14:19:12 +0000789#if LONG_BIT >= 128
790#define BLOOM_WIDTH 128
791#elif LONG_BIT >= 64
792#define BLOOM_WIDTH 64
793#elif LONG_BIT >= 32
794#define BLOOM_WIDTH 32
795#else
796#error "LONG_BIT is smaller than 32"
797#endif
798
Thomas Wouters477c8d52006-05-27 19:21:47 +0000799#define BLOOM_MASK unsigned long
800
Serhiy Storchaka05997252013-01-26 12:14:02 +0200801static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000802
Antoine Pitrouf068f942010-01-13 14:19:12 +0000803#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000804
Benjamin Peterson29060642009-01-31 22:14:21 +0000805#define BLOOM_LINEBREAK(ch) \
806 ((ch) < 128U ? ascii_linebreak[(ch)] : \
807 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000808
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700809static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200810make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000811{
Victor Stinnera85af502013-04-09 21:53:54 +0200812#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
813 do { \
814 TYPE *data = (TYPE *)PTR; \
815 TYPE *end = data + LEN; \
816 Py_UCS4 ch; \
817 for (; data != end; data++) { \
818 ch = *data; \
819 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
820 } \
821 break; \
822 } while (0)
823
Thomas Wouters477c8d52006-05-27 19:21:47 +0000824 /* calculate simple bloom-style bitmask for a given unicode string */
825
Antoine Pitrouf068f942010-01-13 14:19:12 +0000826 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000827
828 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200829 switch (kind) {
830 case PyUnicode_1BYTE_KIND:
831 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
832 break;
833 case PyUnicode_2BYTE_KIND:
834 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
835 break;
836 case PyUnicode_4BYTE_KIND:
837 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
838 break;
839 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700840 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200841 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000842 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200843
844#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000845}
846
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300847static int
848ensure_unicode(PyObject *obj)
849{
850 if (!PyUnicode_Check(obj)) {
851 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200852 "must be str, not %.100s",
853 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300854 return -1;
855 }
856 return PyUnicode_READY(obj);
857}
858
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200859/* Compilation of templated routines */
860
861#include "stringlib/asciilib.h"
862#include "stringlib/fastsearch.h"
863#include "stringlib/partition.h"
864#include "stringlib/split.h"
865#include "stringlib/count.h"
866#include "stringlib/find.h"
867#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200868#include "stringlib/undef.h"
869
870#include "stringlib/ucs1lib.h"
871#include "stringlib/fastsearch.h"
872#include "stringlib/partition.h"
873#include "stringlib/split.h"
874#include "stringlib/count.h"
875#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300876#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200877#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200878#include "stringlib/undef.h"
879
880#include "stringlib/ucs2lib.h"
881#include "stringlib/fastsearch.h"
882#include "stringlib/partition.h"
883#include "stringlib/split.h"
884#include "stringlib/count.h"
885#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300886#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200887#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200888#include "stringlib/undef.h"
889
890#include "stringlib/ucs4lib.h"
891#include "stringlib/fastsearch.h"
892#include "stringlib/partition.h"
893#include "stringlib/split.h"
894#include "stringlib/count.h"
895#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300896#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200897#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200898#include "stringlib/undef.h"
899
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200900#include "stringlib/unicodedefs.h"
901#include "stringlib/fastsearch.h"
902#include "stringlib/count.h"
903#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100904#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200905
Guido van Rossumd57fd912000-03-10 22:53:23 +0000906/* --- Unicode Object ----------------------------------------------------- */
907
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700908static inline Py_ssize_t
909findchar(const void *s, int kind,
910 Py_ssize_t size, Py_UCS4 ch,
911 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200912{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200913 switch (kind) {
914 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200915 if ((Py_UCS1) ch != ch)
916 return -1;
917 if (direction > 0)
918 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
919 else
920 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200921 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200922 if ((Py_UCS2) ch != ch)
923 return -1;
924 if (direction > 0)
925 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
926 else
927 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200928 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200929 if (direction > 0)
930 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
931 else
932 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200933 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700934 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200935 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200936}
937
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters added by a resize, i.e. those in the range
   [old_length, current length), are overwritten with 0xff bytes; nothing
   happens if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);   /* bytes per character */
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
956
Victor Stinnerfe226c02011-10-03 03:52:20 +0200957static PyObject*
958resize_compact(PyObject *unicode, Py_ssize_t length)
959{
960 Py_ssize_t char_size;
961 Py_ssize_t struct_size;
962 Py_ssize_t new_size;
963 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100964 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200965#ifdef Py_DEBUG
966 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
967#endif
968
Victor Stinner79891572012-05-03 13:43:07 +0200969 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200970 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100971 assert(PyUnicode_IS_COMPACT(unicode));
972
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200973 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100974 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200975 struct_size = sizeof(PyASCIIObject);
976 else
977 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200978 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200979
Victor Stinnerfe226c02011-10-03 03:52:20 +0200980 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
981 PyErr_NoMemory();
982 return NULL;
983 }
984 new_size = (struct_size + (length + 1) * char_size);
985
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200986 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
987 PyObject_DEL(_PyUnicode_UTF8(unicode));
988 _PyUnicode_UTF8(unicode) = NULL;
989 _PyUnicode_UTF8_LENGTH(unicode) = 0;
990 }
Victor Stinner84def372011-12-11 20:04:56 +0100991 _Py_DEC_REFTOTAL;
992 _Py_ForgetReference(unicode);
993
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300994 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100995 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100996 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200997 PyErr_NoMemory();
998 return NULL;
999 }
Victor Stinner84def372011-12-11 20:04:56 +01001000 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001002
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001004 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001005 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001006 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001007 _PyUnicode_WSTR_LENGTH(unicode) = length;
1008 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001009 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1010 PyObject_DEL(_PyUnicode_WSTR(unicode));
1011 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001012 if (!PyUnicode_IS_ASCII(unicode))
1013 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001014 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001015#ifdef Py_DEBUG
1016 unicode_fill_invalid(unicode, old_length);
1017#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1019 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001020 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001021 return unicode;
1022}
1023
Alexander Belopolsky40018472011-02-26 01:02:56 +00001024static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001025resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026{
Victor Stinner95663112011-10-04 01:03:50 +02001027 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001028 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001029 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001031
Victor Stinnerfe226c02011-10-03 03:52:20 +02001032 if (PyUnicode_IS_READY(unicode)) {
1033 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001034 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001035 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001036#ifdef Py_DEBUG
1037 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1038#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001039
1040 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001041 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001042 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1043 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001044
1045 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1046 PyErr_NoMemory();
1047 return -1;
1048 }
1049 new_size = (length + 1) * char_size;
1050
Victor Stinner7a9105a2011-12-12 00:13:42 +01001051 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1052 {
1053 PyObject_DEL(_PyUnicode_UTF8(unicode));
1054 _PyUnicode_UTF8(unicode) = NULL;
1055 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1056 }
1057
Victor Stinnerfe226c02011-10-03 03:52:20 +02001058 data = (PyObject *)PyObject_REALLOC(data, new_size);
1059 if (data == NULL) {
1060 PyErr_NoMemory();
1061 return -1;
1062 }
1063 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001064 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001065 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001066 _PyUnicode_WSTR_LENGTH(unicode) = length;
1067 }
1068 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001069 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001070 _PyUnicode_UTF8_LENGTH(unicode) = length;
1071 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001072 _PyUnicode_LENGTH(unicode) = length;
1073 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001074#ifdef Py_DEBUG
1075 unicode_fill_invalid(unicode, old_length);
1076#endif
Victor Stinner95663112011-10-04 01:03:50 +02001077 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001078 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001079 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001080 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001081 }
Victor Stinner95663112011-10-04 01:03:50 +02001082 assert(_PyUnicode_WSTR(unicode) != NULL);
1083
1084 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001085 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001086 PyErr_NoMemory();
1087 return -1;
1088 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001089 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001090 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001091 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001092 if (!wstr) {
1093 PyErr_NoMemory();
1094 return -1;
1095 }
1096 _PyUnicode_WSTR(unicode) = wstr;
1097 _PyUnicode_WSTR(unicode)[length] = 0;
1098 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001099 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 return 0;
1101}
1102
Victor Stinnerfe226c02011-10-03 03:52:20 +02001103static PyObject*
1104resize_copy(PyObject *unicode, Py_ssize_t length)
1105{
1106 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001107 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001108 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001109
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001110 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001111
1112 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1113 if (copy == NULL)
1114 return NULL;
1115
1116 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001117 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001118 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001119 }
1120 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001121 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001122
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001123 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001124 if (w == NULL)
1125 return NULL;
1126 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1127 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001128 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001129 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001130 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001131 }
1132}
1133
/* We allocate room for one more character (not just one more byte) than
   requested to make sure the string is U+0000 terminated; some code
   (e.g. new_identifier) relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001137
1138 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001139 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140
1141*/
1142
Alexander Belopolsky40018472011-02-26 01:02:56 +00001143static PyUnicodeObject *
1144_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001145{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001146 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001147 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148
Thomas Wouters477c8d52006-05-27 19:21:47 +00001149 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150 if (length == 0 && unicode_empty != NULL) {
1151 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001152 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153 }
1154
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001155 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001156 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001157 return (PyUnicodeObject *)PyErr_NoMemory();
1158 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001159 if (length < 0) {
1160 PyErr_SetString(PyExc_SystemError,
1161 "Negative size passed to _PyUnicode_New");
1162 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163 }
1164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001165 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1166 if (unicode == NULL)
1167 return NULL;
1168 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001169
1170 _PyUnicode_WSTR_LENGTH(unicode) = length;
1171 _PyUnicode_HASH(unicode) = -1;
1172 _PyUnicode_STATE(unicode).interned = 0;
1173 _PyUnicode_STATE(unicode).kind = 0;
1174 _PyUnicode_STATE(unicode).compact = 0;
1175 _PyUnicode_STATE(unicode).ready = 0;
1176 _PyUnicode_STATE(unicode).ascii = 0;
1177 _PyUnicode_DATA_ANY(unicode) = NULL;
1178 _PyUnicode_LENGTH(unicode) = 0;
1179 _PyUnicode_UTF8(unicode) = NULL;
1180 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1181
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001182 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1183 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001184 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001185 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001186 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001188
Jeremy Hyltond8082792003-09-16 19:41:39 +00001189 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001190 * the caller fails before initializing str -- unicode_resize()
1191 * reads str[0], and the Keep-Alive optimization can keep memory
1192 * allocated for str alive across a call to unicode_dealloc(unicode).
1193 * We don't want unicode_resize to read uninitialized memory in
1194 * that case.
1195 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001196 _PyUnicode_WSTR(unicode)[0] = 0;
1197 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001198
Victor Stinner7931d9a2011-11-04 00:22:48 +01001199 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 return unicode;
1201}
1202
Victor Stinnerf42dc442011-10-02 23:33:16 +02001203static const char*
1204unicode_kind_name(PyObject *unicode)
1205{
Victor Stinner42dfd712011-10-03 14:41:45 +02001206 /* don't check consistency: unicode_kind_name() is called from
1207 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001208 if (!PyUnicode_IS_COMPACT(unicode))
1209 {
1210 if (!PyUnicode_IS_READY(unicode))
1211 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001212 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001213 {
1214 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001215 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001216 return "legacy ascii";
1217 else
1218 return "legacy latin1";
1219 case PyUnicode_2BYTE_KIND:
1220 return "legacy UCS2";
1221 case PyUnicode_4BYTE_KIND:
1222 return "legacy UCS4";
1223 default:
1224 return "<legacy invalid kind>";
1225 }
1226 }
1227 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001228 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001229 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001230 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001231 return "ascii";
1232 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001233 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001234 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001235 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001236 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001237 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001238 default:
1239 return "<invalid compact kind>";
1240 }
1241}
1242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001243#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001244/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001245char *_PyUnicode_utf8(void *unicode_raw){
1246 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001247 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248}
1249
Victor Stinnera42de742018-11-22 10:25:22 +01001250void *_PyUnicode_compact_data(void *unicode_raw) {
1251 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252 return _PyUnicode_COMPACT_DATA(unicode);
1253}
Victor Stinnera42de742018-11-22 10:25:22 +01001254void *_PyUnicode_data(void *unicode_raw) {
1255 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001256 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001257 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1258 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1259 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1260 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1261 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1262 return PyUnicode_DATA(unicode);
1263}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001264
1265void
1266_PyUnicode_Dump(PyObject *op)
1267{
1268 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001269 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1270 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1271 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001272
Victor Stinnera849a4b2011-10-03 12:12:11 +02001273 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001274 {
1275 if (ascii->state.ascii)
1276 data = (ascii + 1);
1277 else
1278 data = (compact + 1);
1279 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001280 else
1281 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001282 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1283 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001284
Victor Stinnera849a4b2011-10-03 12:12:11 +02001285 if (ascii->wstr == data)
1286 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001287 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001288
Victor Stinnera3b334d2011-10-03 13:53:37 +02001289 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001290 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001291 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1292 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001293 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001294 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001295 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001296 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001297}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001298#endif
1299
1300PyObject *
1301PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1302{
1303 PyObject *obj;
1304 PyCompactUnicodeObject *unicode;
1305 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001306 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001307 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308 Py_ssize_t char_size;
1309 Py_ssize_t struct_size;
1310
1311 /* Optimization for empty strings */
1312 if (size == 0 && unicode_empty != NULL) {
1313 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001314 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 }
1316
Victor Stinner9e9d6892011-10-04 01:02:02 +02001317 is_ascii = 0;
1318 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 struct_size = sizeof(PyCompactUnicodeObject);
1320 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001321 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001322 char_size = 1;
1323 is_ascii = 1;
1324 struct_size = sizeof(PyASCIIObject);
1325 }
1326 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001327 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001328 char_size = 1;
1329 }
1330 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001331 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 char_size = 2;
1333 if (sizeof(wchar_t) == 2)
1334 is_sharing = 1;
1335 }
1336 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001337 if (maxchar > MAX_UNICODE) {
1338 PyErr_SetString(PyExc_SystemError,
1339 "invalid maximum character passed to PyUnicode_New");
1340 return NULL;
1341 }
Victor Stinner8f825062012-04-27 13:55:39 +02001342 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001343 char_size = 4;
1344 if (sizeof(wchar_t) == 4)
1345 is_sharing = 1;
1346 }
1347
1348 /* Ensure we won't overflow the size. */
1349 if (size < 0) {
1350 PyErr_SetString(PyExc_SystemError,
1351 "Negative size passed to PyUnicode_New");
1352 return NULL;
1353 }
1354 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1355 return PyErr_NoMemory();
1356
1357 /* Duplicated allocation code from _PyObject_New() instead of a call to
1358 * PyObject_New() so we are able to allocate space for the object and
1359 * it's data buffer.
1360 */
1361 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1362 if (obj == NULL)
1363 return PyErr_NoMemory();
1364 obj = PyObject_INIT(obj, &PyUnicode_Type);
1365 if (obj == NULL)
1366 return NULL;
1367
1368 unicode = (PyCompactUnicodeObject *)obj;
1369 if (is_ascii)
1370 data = ((PyASCIIObject*)obj) + 1;
1371 else
1372 data = unicode + 1;
1373 _PyUnicode_LENGTH(unicode) = size;
1374 _PyUnicode_HASH(unicode) = -1;
1375 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001376 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 _PyUnicode_STATE(unicode).compact = 1;
1378 _PyUnicode_STATE(unicode).ready = 1;
1379 _PyUnicode_STATE(unicode).ascii = is_ascii;
1380 if (is_ascii) {
1381 ((char*)data)[size] = 0;
1382 _PyUnicode_WSTR(unicode) = NULL;
1383 }
Victor Stinner8f825062012-04-27 13:55:39 +02001384 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 ((char*)data)[size] = 0;
1386 _PyUnicode_WSTR(unicode) = NULL;
1387 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001389 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001390 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 else {
1392 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001393 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001394 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001396 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001397 ((Py_UCS4*)data)[size] = 0;
1398 if (is_sharing) {
1399 _PyUnicode_WSTR_LENGTH(unicode) = size;
1400 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1401 }
1402 else {
1403 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1404 _PyUnicode_WSTR(unicode) = NULL;
1405 }
1406 }
Victor Stinner8f825062012-04-27 13:55:39 +02001407#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001408 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001409#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001410 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001411 return obj;
1412}
1413
#if SIZEOF_WCHAR_T == 2
/* Helper used when wchar_t is 16 bits wide: copy the wchar_t range
   [begin, end) into the UCS4 buffer of `unicode`, combining every
   high/low surrogate pair into one code point. Lone surrogates are copied
   unchanged. The other conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        /* There must still be room in the destination buffer. */
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if ((src + 1) < end
            && Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }
    /* The destination must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1453
Victor Stinnercd9950f2011-10-02 00:34:53 +02001454static int
Victor Stinner488fa492011-12-12 00:01:39 +01001455unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001456{
Victor Stinner488fa492011-12-12 00:01:39 +01001457 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001458 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001459 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001460 return -1;
1461 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001462 return 0;
1463}
1464
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001465static int
1466_copy_characters(PyObject *to, Py_ssize_t to_start,
1467 PyObject *from, Py_ssize_t from_start,
1468 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001470 unsigned int from_kind, to_kind;
1471 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472
Victor Stinneree4544c2012-05-09 22:24:08 +02001473 assert(0 <= how_many);
1474 assert(0 <= from_start);
1475 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001476 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001477 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001478 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001479
Victor Stinnerd3f08822012-05-29 12:57:52 +02001480 assert(PyUnicode_Check(to));
1481 assert(PyUnicode_IS_READY(to));
1482 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1483
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001484 if (how_many == 0)
1485 return 0;
1486
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001488 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001490 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001491
Victor Stinnerf1852262012-06-16 16:38:26 +02001492#ifdef Py_DEBUG
1493 if (!check_maxchar
1494 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1495 {
1496 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1497 Py_UCS4 ch;
1498 Py_ssize_t i;
1499 for (i=0; i < how_many; i++) {
1500 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1501 assert(ch <= to_maxchar);
1502 }
1503 }
1504#endif
1505
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001506 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001507 if (check_maxchar
1508 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1509 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001510 /* Writing Latin-1 characters into an ASCII string requires to
1511 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001512 Py_UCS4 max_char;
1513 max_char = ucs1lib_find_max_char(from_data,
1514 (Py_UCS1*)from_data + how_many);
1515 if (max_char >= 128)
1516 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001517 }
Christian Heimesf051e432016-09-13 20:22:02 +02001518 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001519 (char*)from_data + from_kind * from_start,
1520 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001522 else if (from_kind == PyUnicode_1BYTE_KIND
1523 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001524 {
1525 _PyUnicode_CONVERT_BYTES(
1526 Py_UCS1, Py_UCS2,
1527 PyUnicode_1BYTE_DATA(from) + from_start,
1528 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1529 PyUnicode_2BYTE_DATA(to) + to_start
1530 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001531 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001532 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001533 && to_kind == PyUnicode_4BYTE_KIND)
1534 {
1535 _PyUnicode_CONVERT_BYTES(
1536 Py_UCS1, Py_UCS4,
1537 PyUnicode_1BYTE_DATA(from) + from_start,
1538 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1539 PyUnicode_4BYTE_DATA(to) + to_start
1540 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001541 }
1542 else if (from_kind == PyUnicode_2BYTE_KIND
1543 && to_kind == PyUnicode_4BYTE_KIND)
1544 {
1545 _PyUnicode_CONVERT_BYTES(
1546 Py_UCS2, Py_UCS4,
1547 PyUnicode_2BYTE_DATA(from) + from_start,
1548 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1549 PyUnicode_4BYTE_DATA(to) + to_start
1550 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001551 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001552 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001553 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1554
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001555 if (!check_maxchar) {
1556 if (from_kind == PyUnicode_2BYTE_KIND
1557 && to_kind == PyUnicode_1BYTE_KIND)
1558 {
1559 _PyUnicode_CONVERT_BYTES(
1560 Py_UCS2, Py_UCS1,
1561 PyUnicode_2BYTE_DATA(from) + from_start,
1562 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1563 PyUnicode_1BYTE_DATA(to) + to_start
1564 );
1565 }
1566 else if (from_kind == PyUnicode_4BYTE_KIND
1567 && to_kind == PyUnicode_1BYTE_KIND)
1568 {
1569 _PyUnicode_CONVERT_BYTES(
1570 Py_UCS4, Py_UCS1,
1571 PyUnicode_4BYTE_DATA(from) + from_start,
1572 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1573 PyUnicode_1BYTE_DATA(to) + to_start
1574 );
1575 }
1576 else if (from_kind == PyUnicode_4BYTE_KIND
1577 && to_kind == PyUnicode_2BYTE_KIND)
1578 {
1579 _PyUnicode_CONVERT_BYTES(
1580 Py_UCS4, Py_UCS2,
1581 PyUnicode_4BYTE_DATA(from) + from_start,
1582 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1583 PyUnicode_2BYTE_DATA(to) + to_start
1584 );
1585 }
1586 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001587 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001588 }
1589 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001590 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001591 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001592 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001593 Py_ssize_t i;
1594
Victor Stinnera0702ab2011-09-29 14:14:38 +02001595 for (i=0; i < how_many; i++) {
1596 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001597 if (ch > to_maxchar)
1598 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001599 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1600 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001601 }
1602 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001603 return 0;
1604}
1605
Victor Stinnerd3f08822012-05-29 12:57:52 +02001606void
1607_PyUnicode_FastCopyCharacters(
1608 PyObject *to, Py_ssize_t to_start,
1609 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001610{
1611 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1612}
1613
1614Py_ssize_t
1615PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1616 PyObject *from, Py_ssize_t from_start,
1617 Py_ssize_t how_many)
1618{
1619 int err;
1620
1621 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1622 PyErr_BadInternalCall();
1623 return -1;
1624 }
1625
Benjamin Petersonbac79492012-01-14 13:34:47 -05001626 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001627 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001628 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001629 return -1;
1630
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001631 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001632 PyErr_SetString(PyExc_IndexError, "string index out of range");
1633 return -1;
1634 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001635 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001636 PyErr_SetString(PyExc_IndexError, "string index out of range");
1637 return -1;
1638 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001639 if (how_many < 0) {
1640 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1641 return -1;
1642 }
1643 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001644 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1645 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001646 "Cannot write %zi characters at %zi "
1647 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001648 how_many, to_start, PyUnicode_GET_LENGTH(to));
1649 return -1;
1650 }
1651
1652 if (how_many == 0)
1653 return 0;
1654
Victor Stinner488fa492011-12-12 00:01:39 +01001655 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001656 return -1;
1657
1658 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1659 if (err) {
1660 PyErr_Format(PyExc_SystemError,
1661 "Cannot copy %s characters "
1662 "into a string of %s characters",
1663 unicode_kind_name(from),
1664 unicode_kind_name(to));
1665 return -1;
1666 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001667 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001668}
1669
Victor Stinner17222162011-09-28 22:15:37 +02001670/* Find the maximum code point and count the number of surrogate pairs so a
1671 correct string length can be computed before converting a string to UCS4.
1672 This function counts single surrogates as a character and not as a pair.
1673
1674 Return 0 on success, or -1 on error. */
1675static int
1676find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1677 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001678{
1679 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001680 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681
Victor Stinnerc53be962011-10-02 21:33:54 +02001682 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001683 *num_surrogates = 0;
1684 *maxchar = 0;
1685
1686 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001688 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1689 && (iter+1) < end
1690 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1691 {
1692 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1693 ++(*num_surrogates);
1694 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001695 }
1696 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001697#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001698 {
1699 ch = *iter;
1700 iter++;
1701 }
1702 if (ch > *maxchar) {
1703 *maxchar = ch;
1704 if (*maxchar > MAX_UNICODE) {
1705 PyErr_Format(PyExc_ValueError,
1706 "character U+%x is not in range [U+0000; U+10ffff]",
1707 ch);
1708 return -1;
1709 }
1710 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001711 }
1712 return 0;
1713}
1714
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001715int
1716_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001717{
1718 wchar_t *end;
1719 Py_UCS4 maxchar = 0;
1720 Py_ssize_t num_surrogates;
1721#if SIZEOF_WCHAR_T == 2
1722 Py_ssize_t length_wo_surrogates;
1723#endif
1724
Georg Brandl7597add2011-10-05 16:36:47 +02001725 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001726 strings were created using _PyObject_New() and where no canonical
1727 representation (the str field) has been set yet aka strings
1728 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001729 assert(_PyUnicode_CHECK(unicode));
1730 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001732 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001733 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001734 /* Actually, it should neither be interned nor be anything else: */
1735 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001737 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001738 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001739 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741
1742 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001743 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1744 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745 PyErr_NoMemory();
1746 return -1;
1747 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001748 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 _PyUnicode_WSTR(unicode), end,
1750 PyUnicode_1BYTE_DATA(unicode));
1751 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1752 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1753 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1754 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001755 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001756 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001757 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001758 }
1759 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001760 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001761 _PyUnicode_UTF8(unicode) = NULL;
1762 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763 }
1764 PyObject_FREE(_PyUnicode_WSTR(unicode));
1765 _PyUnicode_WSTR(unicode) = NULL;
1766 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1767 }
1768 /* In this case we might have to convert down from 4-byte native
1769 wchar_t to 2-byte unicode. */
1770 else if (maxchar < 65536) {
1771 assert(num_surrogates == 0 &&
1772 "FindMaxCharAndNumSurrogatePairs() messed up");
1773
Victor Stinner506f5922011-09-28 22:34:18 +02001774#if SIZEOF_WCHAR_T == 2
1775 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001776 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001777 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1778 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1779 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001780 _PyUnicode_UTF8(unicode) = NULL;
1781 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001782#else
1783 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001784 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001785 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001786 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001787 PyErr_NoMemory();
1788 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 }
Victor Stinner506f5922011-09-28 22:34:18 +02001790 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1791 _PyUnicode_WSTR(unicode), end,
1792 PyUnicode_2BYTE_DATA(unicode));
1793 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1794 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1795 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001796 _PyUnicode_UTF8(unicode) = NULL;
1797 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001798 PyObject_FREE(_PyUnicode_WSTR(unicode));
1799 _PyUnicode_WSTR(unicode) = NULL;
1800 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1801#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 }
1803 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1804 else {
1805#if SIZEOF_WCHAR_T == 2
1806 /* in case the native representation is 2-bytes, we need to allocate a
1807 new normalized 4-byte version. */
1808 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001809 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1810 PyErr_NoMemory();
1811 return -1;
1812 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001813 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1814 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001815 PyErr_NoMemory();
1816 return -1;
1817 }
1818 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1819 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001820 _PyUnicode_UTF8(unicode) = NULL;
1821 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001822 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1823 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001824 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001825 PyObject_FREE(_PyUnicode_WSTR(unicode));
1826 _PyUnicode_WSTR(unicode) = NULL;
1827 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1828#else
1829 assert(num_surrogates == 0);
1830
Victor Stinnerc3c74152011-10-02 20:39:55 +02001831 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001832 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001833 _PyUnicode_UTF8(unicode) = NULL;
1834 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1836#endif
1837 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1838 }
1839 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001840 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001841 return 0;
1842}
1843
Alexander Belopolsky40018472011-02-26 01:02:56 +00001844static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001845unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846{
Walter Dörwald16807132007-05-25 13:52:07 +00001847 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001848 case SSTATE_NOT_INTERNED:
1849 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001850
Benjamin Peterson29060642009-01-31 22:14:21 +00001851 case SSTATE_INTERNED_MORTAL:
1852 /* revive dead object temporarily for DelItem */
1853 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001854 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001855 Py_FatalError(
1856 "deletion of interned string failed");
1857 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001858
Benjamin Peterson29060642009-01-31 22:14:21 +00001859 case SSTATE_INTERNED_IMMORTAL:
1860 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001861 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001862
Benjamin Peterson29060642009-01-31 22:14:21 +00001863 default:
1864 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001865 }
1866
Victor Stinner03490912011-10-03 23:45:12 +02001867 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001869 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001870 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001871 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1872 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001873
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001874 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875}
1876
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with a canonical kind can be a cached
       Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1893
Alexander Belopolsky40018472011-02-26 01:02:56 +00001894static int
Victor Stinner488fa492011-12-12 00:01:39 +01001895unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001896{
Victor Stinner488fa492011-12-12 00:01:39 +01001897 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001898 if (Py_REFCNT(unicode) != 1)
1899 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001900 if (_PyUnicode_HASH(unicode) != -1)
1901 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001902 if (PyUnicode_CHECK_INTERNED(unicode))
1903 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001904 if (!PyUnicode_CheckExact(unicode))
1905 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001906#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001907 /* singleton refcount is greater than 1 */
1908 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001909#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001910 return 1;
1911}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001912
Victor Stinnerfe226c02011-10-03 03:52:20 +02001913static int
1914unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1915{
1916 PyObject *unicode;
1917 Py_ssize_t old_length;
1918
1919 assert(p_unicode != NULL);
1920 unicode = *p_unicode;
1921
1922 assert(unicode != NULL);
1923 assert(PyUnicode_Check(unicode));
1924 assert(0 <= length);
1925
Victor Stinner910337b2011-10-03 03:20:16 +02001926 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001927 old_length = PyUnicode_WSTR_LENGTH(unicode);
1928 else
1929 old_length = PyUnicode_GET_LENGTH(unicode);
1930 if (old_length == length)
1931 return 0;
1932
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001933 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001934 _Py_INCREF_UNICODE_EMPTY();
1935 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001936 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001937 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001938 return 0;
1939 }
1940
Victor Stinner488fa492011-12-12 00:01:39 +01001941 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001942 PyObject *copy = resize_copy(unicode, length);
1943 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001944 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001945 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001946 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001947 }
1948
Victor Stinnerfe226c02011-10-03 03:52:20 +02001949 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001950 PyObject *new_unicode = resize_compact(unicode, length);
1951 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001952 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001953 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001954 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001955 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001956 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001957}
1958
Alexander Belopolsky40018472011-02-26 01:02:56 +00001959int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001960PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001961{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001962 PyObject *unicode;
1963 if (p_unicode == NULL) {
1964 PyErr_BadInternalCall();
1965 return -1;
1966 }
1967 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001968 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001969 {
1970 PyErr_BadInternalCall();
1971 return -1;
1972 }
1973 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001974}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001975
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001976/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001977
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001978 WARNING: The function doesn't copy the terminating null character and
1979 doesn't check the maximum character (may write a latin1 character in an
1980 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001981static void
1982unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1983 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001984{
1985 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1986 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001987 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001988
1989 switch (kind) {
1990 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001991 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001992#ifdef Py_DEBUG
1993 if (PyUnicode_IS_ASCII(unicode)) {
1994 Py_UCS4 maxchar = ucs1lib_find_max_char(
1995 (const Py_UCS1*)str,
1996 (const Py_UCS1*)str + len);
1997 assert(maxchar < 128);
1998 }
1999#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002000 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002001 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002002 }
2003 case PyUnicode_2BYTE_KIND: {
2004 Py_UCS2 *start = (Py_UCS2 *)data + index;
2005 Py_UCS2 *ucs2 = start;
2006 assert(index <= PyUnicode_GET_LENGTH(unicode));
2007
Victor Stinner184252a2012-06-16 02:57:41 +02002008 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002009 *ucs2 = (Py_UCS2)*str;
2010
2011 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002012 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002013 }
2014 default: {
2015 Py_UCS4 *start = (Py_UCS4 *)data + index;
2016 Py_UCS4 *ucs4 = start;
2017 assert(kind == PyUnicode_4BYTE_KIND);
2018 assert(index <= PyUnicode_GET_LENGTH(unicode));
2019
Victor Stinner184252a2012-06-16 02:57:41 +02002020 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002021 *ucs4 = (Py_UCS4)*str;
2022
2023 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002024 }
2025 }
2026}
2027
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002028static PyObject*
2029get_latin1_char(unsigned char ch)
2030{
Victor Stinnera464fc12011-10-02 20:39:30 +02002031 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002033 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002034 if (!unicode)
2035 return NULL;
2036 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002037 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038 unicode_latin1[ch] = unicode;
2039 }
2040 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002041 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042}
2043
Victor Stinner985a82a2014-01-03 12:53:47 +01002044static PyObject*
2045unicode_char(Py_UCS4 ch)
2046{
2047 PyObject *unicode;
2048
2049 assert(ch <= MAX_UNICODE);
2050
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002051 if (ch < 256)
2052 return get_latin1_char(ch);
2053
Victor Stinner985a82a2014-01-03 12:53:47 +01002054 unicode = PyUnicode_New(1, ch);
2055 if (unicode == NULL)
2056 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002057
2058 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2059 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002060 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002061 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002062 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2063 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2064 }
2065 assert(_PyUnicode_CheckConsistency(unicode, 1));
2066 return unicode;
2067}
2068
Alexander Belopolsky40018472011-02-26 01:02:56 +00002069PyObject *
2070PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002071{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002072 if (u == NULL)
2073 return (PyObject*)_PyUnicode_New(size);
2074
2075 if (size < 0) {
2076 PyErr_BadInternalCall();
2077 return NULL;
2078 }
2079
2080 return PyUnicode_FromWideChar(u, size);
2081}
2082
2083PyObject *
2084PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2085{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002086 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002087 Py_UCS4 maxchar = 0;
2088 Py_ssize_t num_surrogates;
2089
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002090 if (u == NULL && size != 0) {
2091 PyErr_BadInternalCall();
2092 return NULL;
2093 }
2094
2095 if (size == -1) {
2096 size = wcslen(u);
2097 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002098
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002099 /* If the Unicode data is known at construction time, we can apply
2100 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002102 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002103 if (size == 0)
2104 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 /* Single character Unicode objects in the Latin-1 range are
2107 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002108 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002109 return get_latin1_char((unsigned char)*u);
2110
2111 /* If not empty and not single character, copy the Unicode data
2112 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002113 if (find_maxchar_surrogates(u, u + size,
2114 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002115 return NULL;
2116
Victor Stinner8faf8212011-12-08 22:14:11 +01002117 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 if (!unicode)
2119 return NULL;
2120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002121 switch (PyUnicode_KIND(unicode)) {
2122 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002123 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002124 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2125 break;
2126 case PyUnicode_2BYTE_KIND:
2127#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002128 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002130 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2132#endif
2133 break;
2134 case PyUnicode_4BYTE_KIND:
2135#if SIZEOF_WCHAR_T == 2
2136 /* This is the only case which has to process surrogates, thus
2137 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002138 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002139#else
2140 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002141 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002142#endif
2143 break;
2144 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002145 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002148 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002149}
2150
Alexander Belopolsky40018472011-02-26 01:02:56 +00002151PyObject *
2152PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002153{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002154 if (size < 0) {
2155 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002156 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002157 return NULL;
2158 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002159 if (u != NULL)
2160 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2161 else
2162 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002163}
2164
Alexander Belopolsky40018472011-02-26 01:02:56 +00002165PyObject *
2166PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002167{
2168 size_t size = strlen(u);
2169 if (size > PY_SSIZE_T_MAX) {
2170 PyErr_SetString(PyExc_OverflowError, "input too long");
2171 return NULL;
2172 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002173 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002174}
2175
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002176PyObject *
2177_PyUnicode_FromId(_Py_Identifier *id)
2178{
2179 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002180 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2181 strlen(id->string),
2182 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002183 if (!id->object)
2184 return NULL;
2185 PyUnicode_InternInPlace(&id->object);
2186 assert(!id->next);
2187 id->next = static_strings;
2188 static_strings = id;
2189 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002190 return id->object;
2191}
2192
2193void
2194_PyUnicode_ClearStaticStrings()
2195{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002196 _Py_Identifier *tmp, *s = static_strings;
2197 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002198 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002199 tmp = s->next;
2200 s->next = NULL;
2201 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002202 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002203 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002204}
2205
Benjamin Peterson0df54292012-03-26 14:50:32 -04002206/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002207
Victor Stinnerd3f08822012-05-29 12:57:52 +02002208PyObject*
2209_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002210{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002211 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002212 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002213 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002214#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002215 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002216#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002217 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002218 }
Victor Stinner785938e2011-12-11 20:09:03 +01002219 unicode = PyUnicode_New(size, 127);
2220 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002221 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002222 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2223 assert(_PyUnicode_CheckConsistency(unicode, 1));
2224 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002225}
2226
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002227static Py_UCS4
2228kind_maxchar_limit(unsigned int kind)
2229{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002230 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002231 case PyUnicode_1BYTE_KIND:
2232 return 0x80;
2233 case PyUnicode_2BYTE_KIND:
2234 return 0x100;
2235 case PyUnicode_4BYTE_KIND:
2236 return 0x10000;
2237 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002238 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002239 }
2240}
2241
Victor Stinner702c7342011-10-05 13:50:52 +02002242static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002243_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002244{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002246 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002247
Serhiy Storchaka678db842013-01-26 12:16:36 +02002248 if (size == 0)
2249 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002250 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002251 if (size == 1)
2252 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002253
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002254 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002255 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 if (!res)
2257 return NULL;
2258 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002259 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002261}
2262
Victor Stinnere57b1c02011-09-28 22:20:48 +02002263static PyObject*
2264_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002265{
2266 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002267 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002268
Serhiy Storchaka678db842013-01-26 12:16:36 +02002269 if (size == 0)
2270 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002271 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002272 if (size == 1)
2273 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002274
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002275 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002276 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002277 if (!res)
2278 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002279 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002280 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002281 else {
2282 _PyUnicode_CONVERT_BYTES(
2283 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2284 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002285 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002286 return res;
2287}
2288
Victor Stinnere57b1c02011-09-28 22:20:48 +02002289static PyObject*
2290_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002291{
2292 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002293 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002294
Serhiy Storchaka678db842013-01-26 12:16:36 +02002295 if (size == 0)
2296 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002297 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002298 if (size == 1)
2299 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002300
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002301 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002302 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002303 if (!res)
2304 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002305 if (max_char < 256)
2306 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2307 PyUnicode_1BYTE_DATA(res));
2308 else if (max_char < 0x10000)
2309 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2310 PyUnicode_2BYTE_DATA(res));
2311 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002313 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002314 return res;
2315}
2316
2317PyObject*
2318PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2319{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002320 if (size < 0) {
2321 PyErr_SetString(PyExc_ValueError, "size must be positive");
2322 return NULL;
2323 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002324 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002325 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002326 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002327 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002328 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002329 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002330 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002331 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002332 PyErr_SetString(PyExc_SystemError, "invalid kind");
2333 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002334 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002335}
2336
Victor Stinnerece58de2012-04-23 23:36:38 +02002337Py_UCS4
2338_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2339{
2340 enum PyUnicode_Kind kind;
2341 void *startptr, *endptr;
2342
2343 assert(PyUnicode_IS_READY(unicode));
2344 assert(0 <= start);
2345 assert(end <= PyUnicode_GET_LENGTH(unicode));
2346 assert(start <= end);
2347
2348 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2349 return PyUnicode_MAX_CHAR_VALUE(unicode);
2350
2351 if (start == end)
2352 return 127;
2353
Victor Stinner94d558b2012-04-27 22:26:58 +02002354 if (PyUnicode_IS_ASCII(unicode))
2355 return 127;
2356
Victor Stinnerece58de2012-04-23 23:36:38 +02002357 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002358 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002359 endptr = (char *)startptr + end * kind;
2360 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002361 switch(kind) {
2362 case PyUnicode_1BYTE_KIND:
2363 return ucs1lib_find_max_char(startptr, endptr);
2364 case PyUnicode_2BYTE_KIND:
2365 return ucs2lib_find_max_char(startptr, endptr);
2366 case PyUnicode_4BYTE_KIND:
2367 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002368 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002369 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002370 }
2371}
2372
Victor Stinner25a4b292011-10-06 12:31:55 +02002373/* Ensure that a string uses the most efficient storage, if it is not the
2374 case: create a new string with of the right kind. Write NULL into *p_unicode
2375 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002376static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002377unicode_adjust_maxchar(PyObject **p_unicode)
2378{
2379 PyObject *unicode, *copy;
2380 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002381 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002382 unsigned int kind;
2383
2384 assert(p_unicode != NULL);
2385 unicode = *p_unicode;
2386 assert(PyUnicode_IS_READY(unicode));
2387 if (PyUnicode_IS_ASCII(unicode))
2388 return;
2389
2390 len = PyUnicode_GET_LENGTH(unicode);
2391 kind = PyUnicode_KIND(unicode);
2392 if (kind == PyUnicode_1BYTE_KIND) {
2393 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002394 max_char = ucs1lib_find_max_char(u, u + len);
2395 if (max_char >= 128)
2396 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002397 }
2398 else if (kind == PyUnicode_2BYTE_KIND) {
2399 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002400 max_char = ucs2lib_find_max_char(u, u + len);
2401 if (max_char >= 256)
2402 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002403 }
2404 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002405 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002406 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002407 max_char = ucs4lib_find_max_char(u, u + len);
2408 if (max_char >= 0x10000)
2409 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002410 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002411 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002412 if (copy != NULL)
2413 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002414 Py_DECREF(unicode);
2415 *p_unicode = copy;
2416}
2417
Victor Stinner034f6cf2011-09-30 02:26:44 +02002418PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002419_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002420{
Victor Stinner87af4f22011-11-21 23:03:47 +01002421 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002422 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002423
Victor Stinner034f6cf2011-09-30 02:26:44 +02002424 if (!PyUnicode_Check(unicode)) {
2425 PyErr_BadInternalCall();
2426 return NULL;
2427 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002428 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002429 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002430
Victor Stinner87af4f22011-11-21 23:03:47 +01002431 length = PyUnicode_GET_LENGTH(unicode);
2432 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002433 if (!copy)
2434 return NULL;
2435 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2436
Christian Heimesf051e432016-09-13 20:22:02 +02002437 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002438 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002439 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002440 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002441}
2442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002443
Victor Stinnerbc603d12011-10-02 01:00:40 +02002444/* Widen Unicode objects to larger buffers. Don't write terminating null
2445 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002446
2447void*
2448_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2449{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002450 Py_ssize_t len;
2451 void *result;
2452 unsigned int skind;
2453
Benjamin Petersonbac79492012-01-14 13:34:47 -05002454 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002455 return NULL;
2456
2457 len = PyUnicode_GET_LENGTH(s);
2458 skind = PyUnicode_KIND(s);
2459 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002460 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002461 return NULL;
2462 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002463 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002464 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002465 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002466 if (!result)
2467 return PyErr_NoMemory();
2468 assert(skind == PyUnicode_1BYTE_KIND);
2469 _PyUnicode_CONVERT_BYTES(
2470 Py_UCS1, Py_UCS2,
2471 PyUnicode_1BYTE_DATA(s),
2472 PyUnicode_1BYTE_DATA(s) + len,
2473 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002474 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002475 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002476 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002477 if (!result)
2478 return PyErr_NoMemory();
2479 if (skind == PyUnicode_2BYTE_KIND) {
2480 _PyUnicode_CONVERT_BYTES(
2481 Py_UCS2, Py_UCS4,
2482 PyUnicode_2BYTE_DATA(s),
2483 PyUnicode_2BYTE_DATA(s) + len,
2484 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002486 else {
2487 assert(skind == PyUnicode_1BYTE_KIND);
2488 _PyUnicode_CONVERT_BYTES(
2489 Py_UCS1, Py_UCS4,
2490 PyUnicode_1BYTE_DATA(s),
2491 PyUnicode_1BYTE_DATA(s) + len,
2492 result);
2493 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002495 default:
2496 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 }
Victor Stinner01698042011-10-04 00:04:26 +02002498 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 return NULL;
2500}
2501
2502static Py_UCS4*
2503as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2504 int copy_null)
2505{
2506 int kind;
2507 void *data;
2508 Py_ssize_t len, targetlen;
2509 if (PyUnicode_READY(string) == -1)
2510 return NULL;
2511 kind = PyUnicode_KIND(string);
2512 data = PyUnicode_DATA(string);
2513 len = PyUnicode_GET_LENGTH(string);
2514 targetlen = len;
2515 if (copy_null)
2516 targetlen++;
2517 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002518 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002519 if (!target) {
2520 PyErr_NoMemory();
2521 return NULL;
2522 }
2523 }
2524 else {
2525 if (targetsize < targetlen) {
2526 PyErr_Format(PyExc_SystemError,
2527 "string is longer than the buffer");
2528 if (copy_null && 0 < targetsize)
2529 target[0] = 0;
2530 return NULL;
2531 }
2532 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002533 if (kind == PyUnicode_1BYTE_KIND) {
2534 Py_UCS1 *start = (Py_UCS1 *) data;
2535 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002536 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002537 else if (kind == PyUnicode_2BYTE_KIND) {
2538 Py_UCS2 *start = (Py_UCS2 *) data;
2539 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2540 }
2541 else {
2542 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002543 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002545 if (copy_null)
2546 target[len] = 0;
2547 return target;
2548}
2549
2550Py_UCS4*
2551PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2552 int copy_null)
2553{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002554 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002555 PyErr_BadInternalCall();
2556 return NULL;
2557 }
2558 return as_ucs4(string, target, targetsize, copy_null);
2559}
2560
2561Py_UCS4*
2562PyUnicode_AsUCS4Copy(PyObject *string)
2563{
2564 return as_ucs4(string, NULL, 0, 1);
2565}
2566
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2570#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002571
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002572static int
2573unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2574 Py_ssize_t width, Py_ssize_t precision)
2575{
2576 Py_ssize_t length, fill, arglen;
2577 Py_UCS4 maxchar;
2578
2579 if (PyUnicode_READY(str) == -1)
2580 return -1;
2581
2582 length = PyUnicode_GET_LENGTH(str);
2583 if ((precision == -1 || precision >= length)
2584 && width <= length)
2585 return _PyUnicodeWriter_WriteStr(writer, str);
2586
2587 if (precision != -1)
2588 length = Py_MIN(precision, length);
2589
2590 arglen = Py_MAX(length, width);
2591 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2592 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2593 else
2594 maxchar = writer->maxchar;
2595
2596 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2597 return -1;
2598
2599 if (width > length) {
2600 fill = width - length;
2601 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2602 return -1;
2603 writer->pos += fill;
2604 }
2605
2606 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2607 str, 0, length);
2608 writer->pos += length;
2609 return 0;
2610}
2611
2612static int
Victor Stinner998b8062018-09-12 00:23:25 +02002613unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002614 Py_ssize_t width, Py_ssize_t precision)
2615{
2616 /* UTF-8 */
2617 Py_ssize_t length;
2618 PyObject *unicode;
2619 int res;
2620
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002621 if (precision == -1) {
2622 length = strlen(str);
2623 }
2624 else {
2625 length = 0;
2626 while (length < precision && str[length]) {
2627 length++;
2628 }
2629 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002630 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2631 if (unicode == NULL)
2632 return -1;
2633
2634 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2635 Py_DECREF(unicode);
2636 return res;
2637}
2638
Victor Stinner96865452011-03-01 23:44:09 +00002639static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002640unicode_fromformat_arg(_PyUnicodeWriter *writer,
2641 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002642{
Victor Stinnere215d962012-10-06 23:03:36 +02002643 const char *p;
2644 Py_ssize_t len;
2645 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002646 Py_ssize_t width;
2647 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002648 int longflag;
2649 int longlongflag;
2650 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002651 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002652
2653 p = f;
2654 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002655 zeropad = 0;
2656 if (*f == '0') {
2657 zeropad = 1;
2658 f++;
2659 }
Victor Stinner96865452011-03-01 23:44:09 +00002660
2661 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002662 width = -1;
2663 if (Py_ISDIGIT((unsigned)*f)) {
2664 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002665 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002666 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002667 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002668 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002669 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002670 return NULL;
2671 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002672 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002673 f++;
2674 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002675 }
2676 precision = -1;
2677 if (*f == '.') {
2678 f++;
2679 if (Py_ISDIGIT((unsigned)*f)) {
2680 precision = (*f - '0');
2681 f++;
2682 while (Py_ISDIGIT((unsigned)*f)) {
2683 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2684 PyErr_SetString(PyExc_ValueError,
2685 "precision too big");
2686 return NULL;
2687 }
2688 precision = (precision * 10) + (*f - '0');
2689 f++;
2690 }
2691 }
Victor Stinner96865452011-03-01 23:44:09 +00002692 if (*f == '%') {
2693 /* "%.3%s" => f points to "3" */
2694 f--;
2695 }
2696 }
2697 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002698 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002699 f--;
2700 }
Victor Stinner96865452011-03-01 23:44:09 +00002701
2702 /* Handle %ld, %lu, %lld and %llu. */
2703 longflag = 0;
2704 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002705 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002706 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002707 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002708 longflag = 1;
2709 ++f;
2710 }
Victor Stinner96865452011-03-01 23:44:09 +00002711 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002712 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002713 longlongflag = 1;
2714 f += 2;
2715 }
Victor Stinner96865452011-03-01 23:44:09 +00002716 }
2717 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002718 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002719 size_tflag = 1;
2720 ++f;
2721 }
Victor Stinnere215d962012-10-06 23:03:36 +02002722
2723 if (f[1] == '\0')
2724 writer->overallocate = 0;
2725
2726 switch (*f) {
2727 case 'c':
2728 {
2729 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002730 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002731 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002732 "character argument not in range(0x110000)");
2733 return NULL;
2734 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002735 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002736 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002737 break;
2738 }
2739
2740 case 'i':
2741 case 'd':
2742 case 'u':
2743 case 'x':
2744 {
2745 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002746 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002747 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002748
2749 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002750 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002751 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002752 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002753 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002754 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002755 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002756 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002757 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002758 va_arg(*vargs, size_t));
2759 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002760 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002761 va_arg(*vargs, unsigned int));
2762 }
2763 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002764 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002765 }
2766 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002767 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002768 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002769 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002770 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002771 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002772 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002773 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002774 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002775 va_arg(*vargs, Py_ssize_t));
2776 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002777 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002778 va_arg(*vargs, int));
2779 }
2780 assert(len >= 0);
2781
Victor Stinnere215d962012-10-06 23:03:36 +02002782 if (precision < len)
2783 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002784
2785 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002786 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2787 return NULL;
2788
Victor Stinnere215d962012-10-06 23:03:36 +02002789 if (width > precision) {
2790 Py_UCS4 fillchar;
2791 fill = width - precision;
2792 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002793 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2794 return NULL;
2795 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002796 }
Victor Stinner15a11362012-10-06 23:48:20 +02002797 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002798 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002799 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2800 return NULL;
2801 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002802 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002803
Victor Stinner4a587072013-11-19 12:54:53 +01002804 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2805 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002806 break;
2807 }
2808
2809 case 'p':
2810 {
2811 char number[MAX_LONG_LONG_CHARS];
2812
2813 len = sprintf(number, "%p", va_arg(*vargs, void*));
2814 assert(len >= 0);
2815
2816 /* %p is ill-defined: ensure leading 0x. */
2817 if (number[1] == 'X')
2818 number[1] = 'x';
2819 else if (number[1] != 'x') {
2820 memmove(number + 2, number,
2821 strlen(number) + 1);
2822 number[0] = '0';
2823 number[1] = 'x';
2824 len += 2;
2825 }
2826
Victor Stinner4a587072013-11-19 12:54:53 +01002827 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002828 return NULL;
2829 break;
2830 }
2831
2832 case 's':
2833 {
2834 /* UTF-8 */
2835 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002836 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002837 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002838 break;
2839 }
2840
2841 case 'U':
2842 {
2843 PyObject *obj = va_arg(*vargs, PyObject *);
2844 assert(obj && _PyUnicode_CHECK(obj));
2845
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002846 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002847 return NULL;
2848 break;
2849 }
2850
2851 case 'V':
2852 {
2853 PyObject *obj = va_arg(*vargs, PyObject *);
2854 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002855 if (obj) {
2856 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002857 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002858 return NULL;
2859 }
2860 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002861 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002862 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002863 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002864 }
2865 break;
2866 }
2867
2868 case 'S':
2869 {
2870 PyObject *obj = va_arg(*vargs, PyObject *);
2871 PyObject *str;
2872 assert(obj);
2873 str = PyObject_Str(obj);
2874 if (!str)
2875 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002876 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002877 Py_DECREF(str);
2878 return NULL;
2879 }
2880 Py_DECREF(str);
2881 break;
2882 }
2883
2884 case 'R':
2885 {
2886 PyObject *obj = va_arg(*vargs, PyObject *);
2887 PyObject *repr;
2888 assert(obj);
2889 repr = PyObject_Repr(obj);
2890 if (!repr)
2891 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002892 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002893 Py_DECREF(repr);
2894 return NULL;
2895 }
2896 Py_DECREF(repr);
2897 break;
2898 }
2899
2900 case 'A':
2901 {
2902 PyObject *obj = va_arg(*vargs, PyObject *);
2903 PyObject *ascii;
2904 assert(obj);
2905 ascii = PyObject_ASCII(obj);
2906 if (!ascii)
2907 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002908 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002909 Py_DECREF(ascii);
2910 return NULL;
2911 }
2912 Py_DECREF(ascii);
2913 break;
2914 }
2915
2916 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002917 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002918 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002919 break;
2920
2921 default:
2922 /* if we stumble upon an unknown formatting code, copy the rest
2923 of the format string to the output string. (we cannot just
2924 skip the code, since there's no way to know what's in the
2925 argument list) */
2926 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002927 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002928 return NULL;
2929 f = p+len;
2930 return f;
2931 }
2932
2933 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002934 return f;
2935}
2936
Walter Dörwaldd2034312007-05-18 16:29:38 +00002937PyObject *
2938PyUnicode_FromFormatV(const char *format, va_list vargs)
2939{
Victor Stinnere215d962012-10-06 23:03:36 +02002940 va_list vargs2;
2941 const char *f;
2942 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002943
Victor Stinner8f674cc2013-04-17 23:02:17 +02002944 _PyUnicodeWriter_Init(&writer);
2945 writer.min_length = strlen(format) + 100;
2946 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002947
Benjamin Peterson0c212142016-09-20 20:39:33 -07002948 // Copy varags to be able to pass a reference to a subfunction.
2949 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002950
2951 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002952 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002953 f = unicode_fromformat_arg(&writer, f, &vargs2);
2954 if (f == NULL)
2955 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002956 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002957 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002958 const char *p;
2959 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002960
Victor Stinnere215d962012-10-06 23:03:36 +02002961 p = f;
2962 do
2963 {
2964 if ((unsigned char)*p > 127) {
2965 PyErr_Format(PyExc_ValueError,
2966 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2967 "string, got a non-ASCII byte: 0x%02x",
2968 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002969 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002970 }
2971 p++;
2972 }
2973 while (*p != '\0' && *p != '%');
2974 len = p - f;
2975
2976 if (*p == '\0')
2977 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002978
2979 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002980 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002981
2982 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002983 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002984 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002985 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002986 return _PyUnicodeWriter_Finish(&writer);
2987
2988 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002989 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002990 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002991 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002992}
2993
Walter Dörwaldd2034312007-05-18 16:29:38 +00002994PyObject *
2995PyUnicode_FromFormat(const char *format, ...)
2996{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002997 PyObject* ret;
2998 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002999
3000#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003001 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003002#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003003 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003004#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003005 ret = PyUnicode_FromFormatV(format, vargs);
3006 va_end(vargs);
3007 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003008}
3009
Serhiy Storchakac46db922018-10-23 22:58:24 +03003010static Py_ssize_t
3011unicode_get_widechar_size(PyObject *unicode)
3012{
3013 Py_ssize_t res;
3014
3015 assert(unicode != NULL);
3016 assert(_PyUnicode_CHECK(unicode));
3017
3018 if (_PyUnicode_WSTR(unicode) != NULL) {
3019 return PyUnicode_WSTR_LENGTH(unicode);
3020 }
3021 assert(PyUnicode_IS_READY(unicode));
3022
3023 res = _PyUnicode_LENGTH(unicode);
3024#if SIZEOF_WCHAR_T == 2
3025 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3026 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3027 const Py_UCS4 *end = s + res;
3028 for (; s < end; ++s) {
3029 if (*s > 0xFFFF) {
3030 ++res;
3031 }
3032 }
3033 }
3034#endif
3035 return res;
3036}
3037
3038static void
3039unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3040{
3041 const wchar_t *wstr;
3042
3043 assert(unicode != NULL);
3044 assert(_PyUnicode_CHECK(unicode));
3045
3046 wstr = _PyUnicode_WSTR(unicode);
3047 if (wstr != NULL) {
3048 memcpy(w, wstr, size * sizeof(wchar_t));
3049 return;
3050 }
3051 assert(PyUnicode_IS_READY(unicode));
3052
3053 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3054 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3055 for (; size--; ++s, ++w) {
3056 *w = *s;
3057 }
3058 }
3059 else {
3060#if SIZEOF_WCHAR_T == 4
3061 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3062 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3063 for (; size--; ++s, ++w) {
3064 *w = *s;
3065 }
3066#else
3067 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3068 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3069 for (; size--; ++s, ++w) {
3070 Py_UCS4 ch = *s;
3071 if (ch > 0xFFFF) {
3072 assert(ch <= MAX_UNICODE);
3073 /* encode surrogate pair in this case */
3074 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3075 if (!size--)
3076 break;
3077 *w = Py_UNICODE_LOW_SURROGATE(ch);
3078 }
3079 else {
3080 *w = ch;
3081 }
3082 }
3083#endif
3084 }
3085}
3086
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003087#ifdef HAVE_WCHAR_H
3088
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003089/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003090
Victor Stinnerd88d9832011-09-06 02:00:05 +02003091 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003092 character) required to convert the unicode object. Ignore size argument.
3093
Victor Stinnerd88d9832011-09-06 02:00:05 +02003094 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003095 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003096 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003097Py_ssize_t
3098PyUnicode_AsWideChar(PyObject *unicode,
3099 wchar_t *w,
3100 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003101{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003102 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003103
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003104 if (unicode == NULL) {
3105 PyErr_BadInternalCall();
3106 return -1;
3107 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003108 if (!PyUnicode_Check(unicode)) {
3109 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003110 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003111 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003112
3113 res = unicode_get_widechar_size(unicode);
3114 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003115 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003116 }
3117
3118 if (size > res) {
3119 size = res + 1;
3120 }
3121 else {
3122 res = size;
3123 }
3124 unicode_copy_as_widechar(unicode, w, size);
3125 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003126}
3127
Victor Stinner137c34c2010-09-29 10:25:54 +00003128wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003129PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003130 Py_ssize_t *size)
3131{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003132 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003133 Py_ssize_t buflen;
3134
3135 if (unicode == NULL) {
3136 PyErr_BadInternalCall();
3137 return NULL;
3138 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003139 if (!PyUnicode_Check(unicode)) {
3140 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003141 return NULL;
3142 }
3143
Serhiy Storchakac46db922018-10-23 22:58:24 +03003144 buflen = unicode_get_widechar_size(unicode);
3145 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003146 if (buffer == NULL) {
3147 PyErr_NoMemory();
3148 return NULL;
3149 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003150 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3151 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003152 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003153 }
3154 else if (wcslen(buffer) != (size_t)buflen) {
3155 PyMem_FREE(buffer);
3156 PyErr_SetString(PyExc_ValueError,
3157 "embedded null character");
3158 return NULL;
3159 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003160 return buffer;
3161}
3162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003163#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003164
Alexander Belopolsky40018472011-02-26 01:02:56 +00003165PyObject *
3166PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003167{
Victor Stinner8faf8212011-12-08 22:14:11 +01003168 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003169 PyErr_SetString(PyExc_ValueError,
3170 "chr() arg not in range(0x110000)");
3171 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003172 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003173
Victor Stinner985a82a2014-01-03 12:53:47 +01003174 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003175}
3176
Alexander Belopolsky40018472011-02-26 01:02:56 +00003177PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003178PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003180 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003181 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003182 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003183 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003184 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003185 Py_INCREF(obj);
3186 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003187 }
3188 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003189 /* For a Unicode subtype that's not a Unicode object,
3190 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003191 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003192 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003193 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003194 "Can't convert '%.100s' object to str implicitly",
3195 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003196 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003197}
3198
Alexander Belopolsky40018472011-02-26 01:02:56 +00003199PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003200PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003201 const char *encoding,
3202 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003203{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003204 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003205 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003206
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003208 PyErr_BadInternalCall();
3209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003211
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003212 /* Decoding bytes objects is the most common case and should be fast */
3213 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003214 if (PyBytes_GET_SIZE(obj) == 0)
3215 _Py_RETURN_UNICODE_EMPTY();
3216 v = PyUnicode_Decode(
3217 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3218 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003219 return v;
3220 }
3221
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003222 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003223 PyErr_SetString(PyExc_TypeError,
3224 "decoding str is not supported");
3225 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003226 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003227
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003228 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3229 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3230 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003231 "decoding to str: need a bytes-like object, %.80s found",
3232 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003233 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003234 }
Tim Petersced69f82003-09-16 20:30:58 +00003235
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003236 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003237 PyBuffer_Release(&buffer);
3238 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003240
Serhiy Storchaka05997252013-01-26 12:14:02 +02003241 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003242 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003243 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244}
3245
/* Normalize an encoding name into `lower` (a buffer of `lower_len` bytes).

   Alphanumeric characters and '.' are copied in lower case.  Any run of
   other characters is replaced by a single '_', except that a run at the
   start or at the end of the name produces nothing.  Similar to
   encodings.normalize_encoding(), plus the lower-casing.

   Return 1 on success, or 0 if the result does not fit in lower_len-1
   characters plus the terminating null. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst = lower;
    char *limit = &lower[lower_len - 1];   /* slot kept for the '\0' */
    int pending_sep = 0;

    assert(encoding != NULL);

    for (src = encoding; *src != 0; src++) {
        char c = *src;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Only remember that a separator was seen; it is written
               lazily, right before the next kept character. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && dst != lower) {
            if (dst == limit) {
                return 0;
            }
            *dst++ = '_';
        }
        pending_sep = 0;

        if (dst == limit) {
            return 0;
        }
        *dst++ = Py_TOLOWER(c);
    }

    *dst = '\0';
    return 1;
}
3294
Alexander Belopolsky40018472011-02-26 01:02:56 +00003295PyObject *
3296PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003297 Py_ssize_t size,
3298 const char *encoding,
3299 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003300{
3301 PyObject *buffer = NULL, *unicode;
3302 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003303 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3304
3305 if (encoding == NULL) {
3306 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3307 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003308
Fred Drakee4315f52000-05-09 19:53:39 +00003309 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003310 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3311 char *lower = buflower;
3312
3313 /* Fast paths */
3314 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3315 lower += 3;
3316 if (*lower == '_') {
3317 /* Match "utf8" and "utf_8" */
3318 lower++;
3319 }
3320
3321 if (lower[0] == '8' && lower[1] == 0) {
3322 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3323 }
3324 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3325 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3326 }
3327 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3328 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3329 }
3330 }
3331 else {
3332 if (strcmp(lower, "ascii") == 0
3333 || strcmp(lower, "us_ascii") == 0) {
3334 return PyUnicode_DecodeASCII(s, size, errors);
3335 }
Steve Dowercc16be82016-09-08 10:35:16 -07003336 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003337 else if (strcmp(lower, "mbcs") == 0) {
3338 return PyUnicode_DecodeMBCS(s, size, errors);
3339 }
3340 #endif
3341 else if (strcmp(lower, "latin1") == 0
3342 || strcmp(lower, "latin_1") == 0
3343 || strcmp(lower, "iso_8859_1") == 0
3344 || strcmp(lower, "iso8859_1") == 0) {
3345 return PyUnicode_DecodeLatin1(s, size, errors);
3346 }
3347 }
Victor Stinner37296e82010-06-10 13:36:23 +00003348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349
3350 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003351 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003352 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003353 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003354 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003355 if (buffer == NULL)
3356 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003357 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003358 if (unicode == NULL)
3359 goto onError;
3360 if (!PyUnicode_Check(unicode)) {
3361 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003362 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003363 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003364 encoding,
3365 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 Py_DECREF(unicode);
3367 goto onError;
3368 }
3369 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003370 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003371
Benjamin Peterson29060642009-01-31 22:14:21 +00003372 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 Py_XDECREF(buffer);
3374 return NULL;
3375}
3376
Alexander Belopolsky40018472011-02-26 01:02:56 +00003377PyObject *
3378PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003379 const char *encoding,
3380 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003381{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003382 if (!PyUnicode_Check(unicode)) {
3383 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003384 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003385 }
3386
Serhiy Storchaka00939072016-10-27 21:05:49 +03003387 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3388 "PyUnicode_AsDecodedObject() is deprecated; "
3389 "use PyCodec_Decode() to decode from str", 1) < 0)
3390 return NULL;
3391
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003392 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003393 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003394
3395 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003396 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003397}
3398
Alexander Belopolsky40018472011-02-26 01:02:56 +00003399PyObject *
3400PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003401 const char *encoding,
3402 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003403{
3404 PyObject *v;
3405
3406 if (!PyUnicode_Check(unicode)) {
3407 PyErr_BadArgument();
3408 goto onError;
3409 }
3410
Serhiy Storchaka00939072016-10-27 21:05:49 +03003411 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3412 "PyUnicode_AsDecodedUnicode() is deprecated; "
3413 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3414 return NULL;
3415
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003416 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003417 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003418
3419 /* Decode via the codec registry */
3420 v = PyCodec_Decode(unicode, encoding, errors);
3421 if (v == NULL)
3422 goto onError;
3423 if (!PyUnicode_Check(v)) {
3424 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003425 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003426 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003427 encoding,
3428 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003429 Py_DECREF(v);
3430 goto onError;
3431 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003432 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003433
Benjamin Peterson29060642009-01-31 22:14:21 +00003434 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003435 return NULL;
3436}
3437
Alexander Belopolsky40018472011-02-26 01:02:56 +00003438PyObject *
3439PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003440 Py_ssize_t size,
3441 const char *encoding,
3442 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003443{
3444 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003445
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003446 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003448 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3450 Py_DECREF(unicode);
3451 return v;
3452}
3453
Alexander Belopolsky40018472011-02-26 01:02:56 +00003454PyObject *
3455PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003456 const char *encoding,
3457 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003458{
3459 PyObject *v;
3460
3461 if (!PyUnicode_Check(unicode)) {
3462 PyErr_BadArgument();
3463 goto onError;
3464 }
3465
Serhiy Storchaka00939072016-10-27 21:05:49 +03003466 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3467 "PyUnicode_AsEncodedObject() is deprecated; "
3468 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3469 "or PyCodec_Encode() for generic encoding", 1) < 0)
3470 return NULL;
3471
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003472 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003473 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003474
3475 /* Encode via the codec registry */
3476 v = PyCodec_Encode(unicode, encoding, errors);
3477 if (v == NULL)
3478 goto onError;
3479 return v;
3480
Benjamin Peterson29060642009-01-31 22:14:21 +00003481 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003482 return NULL;
3483}
3484
Victor Stinner1b579672011-12-17 05:47:23 +01003485
Victor Stinner2cba6b82018-01-10 22:46:15 +01003486static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003487unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003488 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003489{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003490 Py_ssize_t wlen;
3491 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3492 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003493 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003494 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003495
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003496 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003497 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003498 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003499 return NULL;
3500 }
3501
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003502 char *str;
3503 size_t error_pos;
3504 const char *reason;
3505 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003506 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003507 PyMem_Free(wstr);
3508
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003509 if (res != 0) {
3510 if (res == -2) {
3511 PyObject *exc;
3512 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3513 "locale", unicode,
3514 (Py_ssize_t)error_pos,
3515 (Py_ssize_t)(error_pos+1),
3516 reason);
3517 if (exc != NULL) {
3518 PyCodec_StrictErrors(exc);
3519 Py_DECREF(exc);
3520 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003521 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003522 else if (res == -3) {
3523 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3524 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003525 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003526 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003527 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003528 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003529 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003530
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003531 PyObject *bytes = PyBytes_FromString(str);
3532 PyMem_RawFree(str);
3533 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003534}
3535
Victor Stinnerad158722010-10-27 00:25:46 +00003536PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003537PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3538{
Victor Stinner709d23d2019-05-02 14:56:30 -04003539 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3540 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003541}
3542
3543PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003544PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003545{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003546 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003547#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003548 if (interp->fs_codec.encoding) {
3549 return unicode_encode_utf8(unicode,
3550 interp->fs_codec.error_handler,
3551 interp->fs_codec.errors);
3552 }
3553 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003554 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003555 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003556 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003557 assert(errors != _Py_ERROR_UNKNOWN);
3558 return unicode_encode_utf8(unicode, errors, NULL);
3559 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003560#else
Victor Stinner793b5312011-04-27 00:24:21 +02003561 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3562 cannot use it to encode and decode filenames before it is loaded. Load
3563 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003564 implementation of the locale codec until the codec registry is
3565 initialized and the Python codec is loaded. See initfsencoding(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003566 if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003567 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003568 interp->fs_codec.encoding,
3569 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003570 }
3571 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003572 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003573 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003574 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003575 assert(errors != _Py_ERROR_UNKNOWN);
3576 return unicode_encode_locale(unicode, errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003577 }
Victor Stinnerad158722010-10-27 00:25:46 +00003578#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003579}
3580
Alexander Belopolsky40018472011-02-26 01:02:56 +00003581PyObject *
3582PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003583 const char *encoding,
3584 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585{
3586 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003587 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003588
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589 if (!PyUnicode_Check(unicode)) {
3590 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003591 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592 }
Fred Drakee4315f52000-05-09 19:53:39 +00003593
Victor Stinner942889a2016-09-05 15:40:10 -07003594 if (encoding == NULL) {
3595 return _PyUnicode_AsUTF8String(unicode, errors);
3596 }
3597
Fred Drakee4315f52000-05-09 19:53:39 +00003598 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003599 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3600 char *lower = buflower;
3601
3602 /* Fast paths */
3603 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3604 lower += 3;
3605 if (*lower == '_') {
3606 /* Match "utf8" and "utf_8" */
3607 lower++;
3608 }
3609
3610 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003611 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003612 }
3613 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3614 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3615 }
3616 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3617 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3618 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003619 }
Victor Stinner942889a2016-09-05 15:40:10 -07003620 else {
3621 if (strcmp(lower, "ascii") == 0
3622 || strcmp(lower, "us_ascii") == 0) {
3623 return _PyUnicode_AsASCIIString(unicode, errors);
3624 }
Steve Dowercc16be82016-09-08 10:35:16 -07003625#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003626 else if (strcmp(lower, "mbcs") == 0) {
3627 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3628 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003629#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003630 else if (strcmp(lower, "latin1") == 0 ||
3631 strcmp(lower, "latin_1") == 0 ||
3632 strcmp(lower, "iso_8859_1") == 0 ||
3633 strcmp(lower, "iso8859_1") == 0) {
3634 return _PyUnicode_AsLatin1String(unicode, errors);
3635 }
3636 }
Victor Stinner37296e82010-06-10 13:36:23 +00003637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003638
3639 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003640 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003642 return NULL;
3643
3644 /* The normal path */
3645 if (PyBytes_Check(v))
3646 return v;
3647
3648 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003649 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003650 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003651 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003652
3653 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003654 "encoder %s returned bytearray instead of bytes; "
3655 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003656 encoding);
3657 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003658 Py_DECREF(v);
3659 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003660 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003661
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003662 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3663 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003664 Py_DECREF(v);
3665 return b;
3666 }
3667
3668 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003669 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003670 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003671 encoding,
3672 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003673 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003674 return NULL;
3675}
3676
Alexander Belopolsky40018472011-02-26 01:02:56 +00003677PyObject *
3678PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003679 const char *encoding,
3680 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003681{
3682 PyObject *v;
3683
3684 if (!PyUnicode_Check(unicode)) {
3685 PyErr_BadArgument();
3686 goto onError;
3687 }
3688
Serhiy Storchaka00939072016-10-27 21:05:49 +03003689 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3690 "PyUnicode_AsEncodedUnicode() is deprecated; "
3691 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3692 return NULL;
3693
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003694 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003695 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003696
3697 /* Encode via the codec registry */
3698 v = PyCodec_Encode(unicode, encoding, errors);
3699 if (v == NULL)
3700 goto onError;
3701 if (!PyUnicode_Check(v)) {
3702 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003703 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003704 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003705 encoding,
3706 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003707 Py_DECREF(v);
3708 goto onError;
3709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003711
Benjamin Peterson29060642009-01-31 22:14:21 +00003712 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 return NULL;
3714}
3715
Victor Stinner2cba6b82018-01-10 22:46:15 +01003716static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003717unicode_decode_locale(const char *str, Py_ssize_t len,
3718 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003719{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003720 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3721 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003722 return NULL;
3723 }
3724
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003725 wchar_t *wstr;
3726 size_t wlen;
3727 const char *reason;
3728 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003729 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003730 if (res != 0) {
3731 if (res == -2) {
3732 PyObject *exc;
3733 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3734 "locale", str, len,
3735 (Py_ssize_t)wlen,
3736 (Py_ssize_t)(wlen + 1),
3737 reason);
3738 if (exc != NULL) {
3739 PyCodec_StrictErrors(exc);
3740 Py_DECREF(exc);
3741 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003742 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003743 else if (res == -3) {
3744 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3745 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003746 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003747 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003748 }
Victor Stinner2f197072011-12-17 07:08:30 +01003749 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003750 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003751
3752 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3753 PyMem_RawFree(wstr);
3754 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003755}
3756
3757PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003758PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3759 const char *errors)
3760{
Victor Stinner709d23d2019-05-02 14:56:30 -04003761 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3762 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003763}
3764
3765PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003766PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003767{
3768 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003769 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3770 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003771}
3772
3773
3774PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003775PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003776 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003777 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3778}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003779
Christian Heimes5894ba72007-11-04 11:43:14 +00003780PyObject*
3781PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3782{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003783 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003784#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003785 if (interp->fs_codec.encoding) {
3786 return unicode_decode_utf8(s, size,
3787 interp->fs_codec.error_handler,
3788 interp->fs_codec.errors,
3789 NULL);
3790 }
3791 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003792 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003793 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003794 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003795 assert(errors != _Py_ERROR_UNKNOWN);
3796 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3797 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003798#else
Victor Stinner793b5312011-04-27 00:24:21 +02003799 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3800 cannot use it to encode and decode filenames before it is loaded. Load
3801 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003802 implementation of the locale codec until the codec registry is
3803 initialized and the Python codec is loaded. See initfsencoding(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003804 if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003805 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003806 interp->fs_codec.encoding,
3807 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003808 }
3809 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003810 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003811 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003812 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003813 return unicode_decode_locale(s, size, errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003814 }
Victor Stinnerad158722010-10-27 00:25:46 +00003815#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003816}
3817
Martin v. Löwis011e8422009-05-05 04:43:17 +00003818
3819int
3820PyUnicode_FSConverter(PyObject* arg, void* addr)
3821{
Brett Cannonec6ce872016-09-06 15:50:29 -07003822 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003823 PyObject *output = NULL;
3824 Py_ssize_t size;
3825 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003826 if (arg == NULL) {
3827 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003828 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003829 return 1;
3830 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003831 path = PyOS_FSPath(arg);
3832 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003833 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003834 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003835 if (PyBytes_Check(path)) {
3836 output = path;
3837 }
3838 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3839 output = PyUnicode_EncodeFSDefault(path);
3840 Py_DECREF(path);
3841 if (!output) {
3842 return 0;
3843 }
3844 assert(PyBytes_Check(output));
3845 }
3846
Victor Stinner0ea2a462010-04-30 00:22:08 +00003847 size = PyBytes_GET_SIZE(output);
3848 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003849 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003850 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003851 Py_DECREF(output);
3852 return 0;
3853 }
3854 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003855 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003856}
3857
3858
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003859int
3860PyUnicode_FSDecoder(PyObject* arg, void* addr)
3861{
Brett Cannona5711202016-09-06 19:36:01 -07003862 int is_buffer = 0;
3863 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003864 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003865 if (arg == NULL) {
3866 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003867 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003868 return 1;
3869 }
Brett Cannona5711202016-09-06 19:36:01 -07003870
3871 is_buffer = PyObject_CheckBuffer(arg);
3872 if (!is_buffer) {
3873 path = PyOS_FSPath(arg);
3874 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003875 return 0;
3876 }
Brett Cannona5711202016-09-06 19:36:01 -07003877 }
3878 else {
3879 path = arg;
3880 Py_INCREF(arg);
3881 }
3882
3883 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003884 output = path;
3885 }
3886 else if (PyBytes_Check(path) || is_buffer) {
3887 PyObject *path_bytes = NULL;
3888
3889 if (!PyBytes_Check(path) &&
3890 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003891 "path should be string, bytes, or os.PathLike, not %.200s",
3892 Py_TYPE(arg)->tp_name)) {
3893 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003894 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003895 }
3896 path_bytes = PyBytes_FromObject(path);
3897 Py_DECREF(path);
3898 if (!path_bytes) {
3899 return 0;
3900 }
3901 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3902 PyBytes_GET_SIZE(path_bytes));
3903 Py_DECREF(path_bytes);
3904 if (!output) {
3905 return 0;
3906 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003907 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003908 else {
3909 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003910 "path should be string, bytes, or os.PathLike, not %.200s",
3911 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003912 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003913 return 0;
3914 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003915 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003916 Py_DECREF(output);
3917 return 0;
3918 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003919 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003920 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003921 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003922 Py_DECREF(output);
3923 return 0;
3924 }
3925 *(PyObject**)addr = output;
3926 return Py_CLEANUP_SUPPORTED;
3927}
3928
3929
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003930const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003931PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003932{
Christian Heimesf3863112007-11-22 07:46:41 +00003933 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003934
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003935 if (!PyUnicode_Check(unicode)) {
3936 PyErr_BadArgument();
3937 return NULL;
3938 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003939 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003940 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003941
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003942 if (PyUnicode_UTF8(unicode) == NULL) {
3943 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003944 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003945 if (bytes == NULL)
3946 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003947 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3948 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003949 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003950 Py_DECREF(bytes);
3951 return NULL;
3952 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003953 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003954 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003955 PyBytes_AS_STRING(bytes),
3956 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003957 Py_DECREF(bytes);
3958 }
3959
3960 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003961 *psize = PyUnicode_UTF8_LENGTH(unicode);
3962 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003963}
3964
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003965const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003966PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003967{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003968 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3969}
3970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003971Py_UNICODE *
3972PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3973{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003974 if (!PyUnicode_Check(unicode)) {
3975 PyErr_BadArgument();
3976 return NULL;
3977 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003978 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3979 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003980 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003981 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003982 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003983
Serhiy Storchakac46db922018-10-23 22:58:24 +03003984 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3985 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3986 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003988 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003989 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3990 if (w == NULL) {
3991 PyErr_NoMemory();
3992 return NULL;
3993 }
3994 unicode_copy_as_widechar(unicode, w, wlen + 1);
3995 _PyUnicode_WSTR(unicode) = w;
3996 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3997 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003998 }
3999 }
4000 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004001 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004002 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004003}
4004
Alexander Belopolsky40018472011-02-26 01:02:56 +00004005Py_UNICODE *
4006PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009}
4010
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004011const Py_UNICODE *
4012_PyUnicode_AsUnicode(PyObject *unicode)
4013{
4014 Py_ssize_t size;
4015 const Py_UNICODE *wstr;
4016
4017 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4018 if (wstr && wcslen(wstr) != (size_t)size) {
4019 PyErr_SetString(PyExc_ValueError, "embedded null character");
4020 return NULL;
4021 }
4022 return wstr;
4023}
4024
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004025
Alexander Belopolsky40018472011-02-26 01:02:56 +00004026Py_ssize_t
4027PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028{
4029 if (!PyUnicode_Check(unicode)) {
4030 PyErr_BadArgument();
4031 goto onError;
4032 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004033 if (_PyUnicode_WSTR(unicode) == NULL) {
4034 if (PyUnicode_AsUnicode(unicode) == NULL)
4035 goto onError;
4036 }
4037 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038
Benjamin Peterson29060642009-01-31 22:14:21 +00004039 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040 return -1;
4041}
4042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043Py_ssize_t
4044PyUnicode_GetLength(PyObject *unicode)
4045{
Victor Stinner07621332012-06-16 04:53:46 +02004046 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047 PyErr_BadArgument();
4048 return -1;
4049 }
Victor Stinner07621332012-06-16 04:53:46 +02004050 if (PyUnicode_READY(unicode) == -1)
4051 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004052 return PyUnicode_GET_LENGTH(unicode);
4053}
4054
4055Py_UCS4
4056PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4057{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004058 void *data;
4059 int kind;
4060
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004061 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004062 PyErr_BadArgument();
4063 return (Py_UCS4)-1;
4064 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004065 if (PyUnicode_READY(unicode) == -1) {
4066 return (Py_UCS4)-1;
4067 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004068 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004069 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004070 return (Py_UCS4)-1;
4071 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004072 data = PyUnicode_DATA(unicode);
4073 kind = PyUnicode_KIND(unicode);
4074 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004075}
4076
4077int
4078PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4079{
4080 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004081 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004082 return -1;
4083 }
Victor Stinner488fa492011-12-12 00:01:39 +01004084 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004085 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004086 PyErr_SetString(PyExc_IndexError, "string index out of range");
4087 return -1;
4088 }
Victor Stinner488fa492011-12-12 00:01:39 +01004089 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004090 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004091 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4092 PyErr_SetString(PyExc_ValueError, "character out of range");
4093 return -1;
4094 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004095 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4096 index, ch);
4097 return 0;
4098}
4099
/* Return the name of the default encoding, which is always "utf-8".
   The returned string is statically allocated. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4105
Victor Stinner554f3f02010-06-16 23:33:54 +00004106/* create or adjust a UnicodeDecodeError */
4107static void
4108make_decode_exception(PyObject **exceptionObject,
4109 const char *encoding,
4110 const char *input, Py_ssize_t length,
4111 Py_ssize_t startpos, Py_ssize_t endpos,
4112 const char *reason)
4113{
4114 if (*exceptionObject == NULL) {
4115 *exceptionObject = PyUnicodeDecodeError_Create(
4116 encoding, input, length, startpos, endpos, reason);
4117 }
4118 else {
4119 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4120 goto onError;
4121 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4122 goto onError;
4123 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4124 goto onError;
4125 }
4126 return;
4127
4128onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004129 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004130}
4131
Steve Dowercc16be82016-09-08 10:35:16 -07004132#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004133static int
4134widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4135{
4136 if (newsize > *size) {
4137 wchar_t *newbuf = *buf;
4138 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4139 PyErr_NoMemory();
4140 return -1;
4141 }
4142 *buf = newbuf;
4143 }
4144 *size = newsize;
4145 return 0;
4146}
4147
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004148/* error handling callback helper:
4149 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004150 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004151 and adjust various state variables.
4152 return 0 on success, -1 on error
4153*/
4154
Alexander Belopolsky40018472011-02-26 01:02:56 +00004155static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004156unicode_decode_call_errorhandler_wchar(
4157 const char *errors, PyObject **errorHandler,
4158 const char *encoding, const char *reason,
4159 const char **input, const char **inend, Py_ssize_t *startinpos,
4160 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004161 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004163 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004164
4165 PyObject *restuple = NULL;
4166 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004167 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004168 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004169 Py_ssize_t requiredsize;
4170 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004171 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004172 wchar_t *repwstr;
4173 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004174
4175 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004176 *errorHandler = PyCodec_LookupError(errors);
4177 if (*errorHandler == NULL)
4178 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004179 }
4180
Victor Stinner554f3f02010-06-16 23:33:54 +00004181 make_decode_exception(exceptionObject,
4182 encoding,
4183 *input, *inend - *input,
4184 *startinpos, *endinpos,
4185 reason);
4186 if (*exceptionObject == NULL)
4187 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004189 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004191 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004193 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004194 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004195 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004196 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004197 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004198
4199 /* Copy back the bytes variables, which might have been modified by the
4200 callback */
4201 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4202 if (!inputobj)
4203 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004204 *input = PyBytes_AS_STRING(inputobj);
4205 insize = PyBytes_GET_SIZE(inputobj);
4206 *inend = *input + insize;
4207 /* we can DECREF safely, as the exception has another reference,
4208 so the object won't go away. */
4209 Py_DECREF(inputobj);
4210
4211 if (newpos<0)
4212 newpos = insize+newpos;
4213 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004214 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004215 goto onError;
4216 }
4217
4218 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4219 if (repwstr == NULL)
4220 goto onError;
4221 /* need more space? (at least enough for what we
4222 have+the replacement+the rest of the string (starting
4223 at the new input position), so we won't have to check space
4224 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004225 requiredsize = *outpos;
4226 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4227 goto overflow;
4228 requiredsize += repwlen;
4229 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4230 goto overflow;
4231 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004232 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004233 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004234 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004235 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004236 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004237 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004238 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004239 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004240 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004241 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004242 *endinpos = newpos;
4243 *inptr = *input + newpos;
4244
4245 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004246 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004247 return 0;
4248
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004249 overflow:
4250 PyErr_SetString(PyExc_OverflowError,
4251 "decoded result is too long for a Python string");
4252
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004253 onError:
4254 Py_XDECREF(restuple);
4255 return -1;
4256}
Steve Dowercc16be82016-09-08 10:35:16 -07004257#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004258
4259static int
4260unicode_decode_call_errorhandler_writer(
4261 const char *errors, PyObject **errorHandler,
4262 const char *encoding, const char *reason,
4263 const char **input, const char **inend, Py_ssize_t *startinpos,
4264 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4265 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4266{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004267 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004268
4269 PyObject *restuple = NULL;
4270 PyObject *repunicode = NULL;
4271 Py_ssize_t insize;
4272 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004273 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004274 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004275 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004276 int need_to_grow = 0;
4277 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004278
4279 if (*errorHandler == NULL) {
4280 *errorHandler = PyCodec_LookupError(errors);
4281 if (*errorHandler == NULL)
4282 goto onError;
4283 }
4284
4285 make_decode_exception(exceptionObject,
4286 encoding,
4287 *input, *inend - *input,
4288 *startinpos, *endinpos,
4289 reason);
4290 if (*exceptionObject == NULL)
4291 goto onError;
4292
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004293 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004294 if (restuple == NULL)
4295 goto onError;
4296 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004297 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004298 goto onError;
4299 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004300 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004301 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004302
4303 /* Copy back the bytes variables, which might have been modified by the
4304 callback */
4305 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4306 if (!inputobj)
4307 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004308 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004309 *input = PyBytes_AS_STRING(inputobj);
4310 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004311 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004312 /* we can DECREF safely, as the exception has another reference,
4313 so the object won't go away. */
4314 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004315
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004316 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004317 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004318 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004319 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004320 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004321 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004322
Victor Stinner170ca6f2013-04-18 00:25:28 +02004323 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004324 if (replen > 1) {
4325 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004326 need_to_grow = 1;
4327 }
4328 new_inptr = *input + newpos;
4329 if (*inend - new_inptr > remain) {
4330 /* We don't know the decoding algorithm here so we make the worst
4331 assumption that one byte decodes to one unicode character.
4332 If unfortunately one byte could decode to more unicode characters,
4333 the decoder may write out-of-bound then. Is it possible for the
4334 algorithms using this function? */
4335 writer->min_length += *inend - new_inptr - remain;
4336 need_to_grow = 1;
4337 }
4338 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004339 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004340 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004341 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4342 goto onError;
4343 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004345 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004346
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004347 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004348 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004349
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004350 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004351 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004352 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353
Benjamin Peterson29060642009-01-31 22:14:21 +00004354 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004356 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004357}
4358
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004359/* --- UTF-7 Codec -------------------------------------------------------- */
4360
Antoine Pitrou244651a2009-05-04 18:56:13 +00004361/* See RFC2152 for details. We encode conservatively and decode liberally. */
4362
/* Base-64 helper macros for the UTF-7 codec.  Each argument may be
   evaluated more than once, so pass side-effect-free expressions only. */

/* Non-zero if c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= 'A' && (c) <= 'Z'))

/* Map a base-64 character to its 6-bit value (0..63).  Only meaningful
   when IS_BASE64(c) holds; any other input yields 63. */

#define FROM_BASE64(c) \
    (((c) == '+') ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: non-zero if this byte, found in a UTF-7 string outside a
 * shift sequence, decodes as itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4393
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it is used as a table index.
 * All three arguments are parenthesized in the expansion so that any
 * expression can be passed; c may be evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004439
Alexander Belopolsky40018472011-02-26 01:02:56 +00004440PyObject *
4441PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004442 Py_ssize_t size,
4443 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004444{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004445 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4446}
4447
Antoine Pitrou244651a2009-05-04 18:56:13 +00004448/* The decoder. The only state we preserve is our read position,
4449 * i.e. how many characters we have consumed. So if we end in the
4450 * middle of a shift sequence we have to back off the read position
4451 * and the output to the beginning of the sequence, otherwise we lose
4452 * all the shift state (seen bits, number of bits seen, high
4453 * surrogate). */
4454
Alexander Belopolsky40018472011-02-26 01:02:56 +00004455PyObject *
4456PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004457 Py_ssize_t size,
4458 const char *errors,
4459 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004460{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004461 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004462 Py_ssize_t startinpos;
4463 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004464 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004465 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004466 const char *errmsg = "";
4467 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004468 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004469 unsigned int base64bits = 0;
4470 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004471 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004472 PyObject *errorHandler = NULL;
4473 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004474
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004475 if (size == 0) {
4476 if (consumed)
4477 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004478 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004479 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004480
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004481 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004482 _PyUnicodeWriter_Init(&writer);
4483 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004484
4485 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004486 e = s + size;
4487
4488 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004489 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004490 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004491 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004492
Antoine Pitrou244651a2009-05-04 18:56:13 +00004493 if (inShift) { /* in a base-64 section */
4494 if (IS_BASE64(ch)) { /* consume a base-64 character */
4495 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4496 base64bits += 6;
4497 s++;
4498 if (base64bits >= 16) {
4499 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004500 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004501 base64bits -= 16;
4502 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004503 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004504 if (surrogate) {
4505 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004506 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4507 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004508 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004509 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004511 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004512 }
4513 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004514 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004515 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004516 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004517 }
4518 }
Victor Stinner551ac952011-11-29 22:58:13 +01004519 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004520 /* first surrogate */
4521 surrogate = outCh;
4522 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004523 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004524 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004525 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004526 }
4527 }
4528 }
4529 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004531 if (base64bits > 0) { /* left-over bits */
4532 if (base64bits >= 6) {
4533 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004534 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535 errmsg = "partial character in shift sequence";
4536 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004537 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004538 else {
4539 /* Some bits remain; they should be zero */
4540 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004541 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 errmsg = "non-zero padding bits in shift sequence";
4543 goto utf7Error;
4544 }
4545 }
4546 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004547 if (surrogate && DECODE_DIRECT(ch)) {
4548 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4549 goto onError;
4550 }
4551 surrogate = 0;
4552 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004553 /* '-' is absorbed; other terminating
4554 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004555 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004556 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004557 }
4558 }
4559 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004560 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004561 s++; /* consume '+' */
4562 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004563 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004564 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004565 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004567 else if (s < e && !IS_BASE64(*s)) {
4568 s++;
4569 errmsg = "ill-formed sequence";
4570 goto utf7Error;
4571 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004572 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004573 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004574 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004575 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004577 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004578 }
4579 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004581 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004582 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004583 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004584 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 else {
4586 startinpos = s-starts;
4587 s++;
4588 errmsg = "unexpected special character";
4589 goto utf7Error;
4590 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004591 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004592utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004593 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004594 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004595 errors, &errorHandler,
4596 "utf7", errmsg,
4597 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004598 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004599 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004600 }
4601
Antoine Pitrou244651a2009-05-04 18:56:13 +00004602 /* end of string */
4603
4604 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4605 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004606 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004607 if (surrogate ||
4608 (base64bits >= 6) ||
4609 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004611 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 errors, &errorHandler,
4613 "utf7", "unterminated shift sequence",
4614 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004615 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 goto onError;
4617 if (s < e)
4618 goto restart;
4619 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004620 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004621
4622 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004623 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004624 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004625 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004626 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004627 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004628 writer.kind, writer.data, shiftOutStart);
4629 Py_XDECREF(errorHandler);
4630 Py_XDECREF(exc);
4631 _PyUnicodeWriter_Dealloc(&writer);
4632 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004633 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004634 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004635 }
4636 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004637 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004638 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004639 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004640
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004641 Py_XDECREF(errorHandler);
4642 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004643 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004644
Benjamin Peterson29060642009-01-31 22:14:21 +00004645 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 Py_XDECREF(errorHandler);
4647 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004648 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004649 return NULL;
4650}
4651
4652
Alexander Belopolsky40018472011-02-26 01:02:56 +00004653PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004654_PyUnicode_EncodeUTF7(PyObject *str,
4655 int base64SetO,
4656 int base64WhiteSpace,
4657 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004658{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004659 int kind;
4660 void *data;
4661 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004662 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004663 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004664 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004665 unsigned int base64bits = 0;
4666 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004667 char * out;
4668 char * start;
4669
Benjamin Petersonbac79492012-01-14 13:34:47 -05004670 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004671 return NULL;
4672 kind = PyUnicode_KIND(str);
4673 data = PyUnicode_DATA(str);
4674 len = PyUnicode_GET_LENGTH(str);
4675
4676 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004677 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004678
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004679 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004680 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004681 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004682 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004683 if (v == NULL)
4684 return NULL;
4685
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004686 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004687 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004688 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004689
Antoine Pitrou244651a2009-05-04 18:56:13 +00004690 if (inShift) {
4691 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4692 /* shifting out */
4693 if (base64bits) { /* output remaining bits */
4694 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4695 base64buffer = 0;
4696 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004697 }
4698 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004699 /* Characters not in the BASE64 set implicitly unshift the sequence
4700 so no '-' is required, except if the character is itself a '-' */
4701 if (IS_BASE64(ch) || ch == '-') {
4702 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004703 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004704 *out++ = (char) ch;
4705 }
4706 else {
4707 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004708 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004709 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004710 else { /* not in a shift sequence */
4711 if (ch == '+') {
4712 *out++ = '+';
4713 *out++ = '-';
4714 }
4715 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4716 *out++ = (char) ch;
4717 }
4718 else {
4719 *out++ = '+';
4720 inShift = 1;
4721 goto encode_char;
4722 }
4723 }
4724 continue;
4725encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004726 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004727 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004728
Antoine Pitrou244651a2009-05-04 18:56:13 +00004729 /* code first surrogate */
4730 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004731 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004732 while (base64bits >= 6) {
4733 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4734 base64bits -= 6;
4735 }
4736 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004737 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004738 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004739 base64bits += 16;
4740 base64buffer = (base64buffer << 16) | ch;
4741 while (base64bits >= 6) {
4742 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4743 base64bits -= 6;
4744 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004745 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004746 if (base64bits)
4747 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4748 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004749 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004750 if (_PyBytes_Resize(&v, out - start) < 0)
4751 return NULL;
4752 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004753}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004754PyObject *
4755PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4756 Py_ssize_t size,
4757 int base64SetO,
4758 int base64WhiteSpace,
4759 const char *errors)
4760{
4761 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004762 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004763 if (tmp == NULL)
4764 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004765 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004766 base64WhiteSpace, errors);
4767 Py_DECREF(tmp);
4768 return result;
4769}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004770
Antoine Pitrou244651a2009-05-04 18:56:13 +00004771#undef IS_BASE64
4772#undef FROM_BASE64
4773#undef TO_BASE64
4774#undef DECODE_DIRECT
4775#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004776
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777/* --- UTF-8 Codec -------------------------------------------------------- */
4778
Alexander Belopolsky40018472011-02-26 01:02:56 +00004779PyObject *
4780PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004781 Py_ssize_t size,
4782 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783{
Walter Dörwald69652032004-09-07 20:24:22 +00004784 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4785}
4786
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004787#include "stringlib/asciilib.h"
4788#include "stringlib/codecs.h"
4789#include "stringlib/undef.h"
4790
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004791#include "stringlib/ucs1lib.h"
4792#include "stringlib/codecs.h"
4793#include "stringlib/undef.h"
4794
4795#include "stringlib/ucs2lib.h"
4796#include "stringlib/codecs.h"
4797#include "stringlib/undef.h"
4798
4799#include "stringlib/ucs4lib.h"
4800#include "stringlib/codecs.h"
4801#include "stringlib/undef.h"
4802
Antoine Pitrouab868312009-01-10 15:40:25 +00004803/* Mask to quickly check whether a C 'long' contains a
4804 non-ASCII, UTF8-encoded char. */
4805#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004806# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004807#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004808# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004809#else
4810# error C 'long' size should be either 4 or 8!
4811#endif
4812
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004813static Py_ssize_t
4814ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004815{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004816 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004817 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004818
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004819 /*
4820 * Issue #17237: m68k is a bit different from most architectures in
4821 * that objects do not use "natural alignment" - for example, int and
4822 * long are only aligned at 2-byte boundaries. Therefore the assert()
4823 * won't work; also, tests have shown that skipping the "optimised
4824 * version" will even speed up m68k.
4825 */
4826#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004827#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004828 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4829 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004830 /* Fast path, see in STRINGLIB(utf8_decode) for
4831 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004832 /* Help allocation */
4833 const char *_p = p;
4834 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004835 while (_p < aligned_end) {
4836 unsigned long value = *(const unsigned long *) _p;
4837 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004838 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004839 *((unsigned long *)q) = value;
4840 _p += SIZEOF_LONG;
4841 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004842 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004843 p = _p;
4844 while (p < end) {
4845 if ((unsigned char)*p & 0x80)
4846 break;
4847 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004851#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004852#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004853 while (p < end) {
4854 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4855 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004856 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004857 /* Help allocation */
4858 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004859 while (_p < aligned_end) {
4860 unsigned long value = *(unsigned long *) _p;
4861 if (value & ASCII_CHAR_MASK)
4862 break;
4863 _p += SIZEOF_LONG;
4864 }
4865 p = _p;
4866 if (_p == end)
4867 break;
4868 }
4869 if ((unsigned char)*p & 0x80)
4870 break;
4871 ++p;
4872 }
4873 memcpy(dest, start, p - start);
4874 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875}
Antoine Pitrouab868312009-01-10 15:40:25 +00004876
Victor Stinner709d23d2019-05-02 14:56:30 -04004877static PyObject *
4878unicode_decode_utf8(const char *s, Py_ssize_t size,
4879 _Py_error_handler error_handler, const char *errors,
4880 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004881{
Victor Stinner785938e2011-12-11 20:09:03 +01004882 if (size == 0) {
4883 if (consumed)
4884 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004885 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004886 }
4887
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4889 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004890 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004891 *consumed = 1;
4892 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004893 }
4894
Inada Naoki770847a2019-06-24 12:30:24 +09004895 const char *starts = s;
4896 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004897
Inada Naoki770847a2019-06-24 12:30:24 +09004898 // fast path: try ASCII string.
4899 PyObject *u = PyUnicode_New(size, 127);
4900 if (u == NULL) {
4901 return NULL;
4902 }
4903 s += ascii_decode(s, end, PyUnicode_DATA(u));
4904 if (s == end) {
4905 return u;
4906 }
4907
4908 // Use _PyUnicodeWriter after fast path is failed.
4909 _PyUnicodeWriter writer;
4910 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4911 writer.pos = s - starts;
4912
4913 Py_ssize_t startinpos, endinpos;
4914 const char *errmsg = "";
4915 PyObject *error_handler_obj = NULL;
4916 PyObject *exc = NULL;
4917
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004918 while (s < end) {
4919 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004920 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004921
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004922 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004923 if (PyUnicode_IS_ASCII(writer.buffer))
4924 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004925 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004926 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004927 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004928 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004929 } else {
4930 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004931 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004932 }
4933
4934 switch (ch) {
4935 case 0:
4936 if (s == end || consumed)
4937 goto End;
4938 errmsg = "unexpected end of data";
4939 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004940 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004941 break;
4942 case 1:
4943 errmsg = "invalid start byte";
4944 startinpos = s - starts;
4945 endinpos = startinpos + 1;
4946 break;
4947 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03004948 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
4949 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
4950 {
4951 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02004952 goto End;
4953 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03004954 /* fall through */
4955 case 3:
4956 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004957 errmsg = "invalid continuation byte";
4958 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004959 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004960 break;
4961 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004962 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004963 goto onError;
4964 continue;
4965 }
4966
Victor Stinner1d65d912015-10-05 13:43:50 +02004967 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004968 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004969
4970 switch (error_handler) {
4971 case _Py_ERROR_IGNORE:
4972 s += (endinpos - startinpos);
4973 break;
4974
4975 case _Py_ERROR_REPLACE:
4976 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4977 goto onError;
4978 s += (endinpos - startinpos);
4979 break;
4980
4981 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004982 {
4983 Py_ssize_t i;
4984
Victor Stinner1d65d912015-10-05 13:43:50 +02004985 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4986 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004987 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004988 ch = (Py_UCS4)(unsigned char)(starts[i]);
4989 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4990 ch + 0xdc00);
4991 writer.pos++;
4992 }
4993 s += (endinpos - startinpos);
4994 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004995 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004996
4997 default:
4998 if (unicode_decode_call_errorhandler_writer(
4999 errors, &error_handler_obj,
5000 "utf-8", errmsg,
5001 &starts, &end, &startinpos, &endinpos, &exc, &s,
5002 &writer))
5003 goto onError;
5004 }
Victor Stinner785938e2011-12-11 20:09:03 +01005005 }
5006
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005008 if (consumed)
5009 *consumed = s - starts;
5010
Victor Stinner1d65d912015-10-05 13:43:50 +02005011 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005013 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005014
5015onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005016 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005017 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005018 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005019 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005020}
5021
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005022
Victor Stinner709d23d2019-05-02 14:56:30 -04005023PyObject *
5024PyUnicode_DecodeUTF8Stateful(const char *s,
5025 Py_ssize_t size,
5026 const char *errors,
5027 Py_ssize_t *consumed)
5028{
5029 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5030}
5031
5032
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005033/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5034 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005035
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005036 On success, write a pointer to a newly allocated wide character string into
5037 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5038 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005039
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005040 On memory allocation failure, return -1.
5041
5042 On decoding error (if surrogateescape is zero), return -2. If wlen is
5043 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5044 is not NULL, write the decoding error message into *reason. */
5045int
5046_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005047 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005048{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005049 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005050 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005051 wchar_t *unicode;
5052 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005053
Victor Stinner3d4226a2018-08-29 22:21:32 +02005054 int surrogateescape = 0;
5055 int surrogatepass = 0;
5056 switch (errors)
5057 {
5058 case _Py_ERROR_STRICT:
5059 break;
5060 case _Py_ERROR_SURROGATEESCAPE:
5061 surrogateescape = 1;
5062 break;
5063 case _Py_ERROR_SURROGATEPASS:
5064 surrogatepass = 1;
5065 break;
5066 default:
5067 return -3;
5068 }
5069
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005070 /* Note: size will always be longer than the resulting Unicode
5071 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005072 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005073 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005074 }
5075
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005076 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005077 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005078 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005079 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005080
5081 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005082 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005084 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005085 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005086#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005087 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005088#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005089 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005090#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005091 if (ch > 0xFF) {
5092#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005093 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005094#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005095 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005096 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005097 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5098 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5099#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005100 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005101 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005102 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005103 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005104 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005105
5106 if (surrogateescape) {
5107 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5108 }
5109 else {
5110 /* Is it a valid three-byte code? */
5111 if (surrogatepass
5112 && (e - s) >= 3
5113 && (s[0] & 0xf0) == 0xe0
5114 && (s[1] & 0xc0) == 0x80
5115 && (s[2] & 0xc0) == 0x80)
5116 {
5117 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5118 s += 3;
5119 unicode[outpos++] = ch;
5120 }
5121 else {
5122 PyMem_RawFree(unicode );
5123 if (reason != NULL) {
5124 switch (ch) {
5125 case 0:
5126 *reason = "unexpected end of data";
5127 break;
5128 case 1:
5129 *reason = "invalid start byte";
5130 break;
5131 /* 2, 3, 4 */
5132 default:
5133 *reason = "invalid continuation byte";
5134 break;
5135 }
5136 }
5137 if (wlen != NULL) {
5138 *wlen = s - orig_s;
5139 }
5140 return -2;
5141 }
5142 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005143 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005144 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005145 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005146 if (wlen) {
5147 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005148 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005149 *wstr = unicode;
5150 return 0;
5151}
5152
Victor Stinner5f9cf232019-03-19 01:46:25 +01005153
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005154wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005155_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5156 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005157{
5158 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005159 int res = _Py_DecodeUTF8Ex(arg, arglen,
5160 &wstr, wlen,
5161 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005162 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005163 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5164 assert(res != -3);
5165 if (wlen) {
5166 *wlen = (size_t)res;
5167 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005168 return NULL;
5169 }
5170 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005171}
5172
Antoine Pitrouab868312009-01-10 15:40:25 +00005173
Victor Stinnere47e6982017-12-21 15:45:16 +01005174/* UTF-8 encoder using the surrogateescape error handler .
5175
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005176 On success, return 0 and write the newly allocated character string (use
5177 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005178
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005179 On encoding failure, return -2 and write the position of the invalid
5180 surrogate character into *error_pos (if error_pos is set) and the decoding
5181 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005182
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005183 On memory allocation failure, return -1. */
5184int
5185_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005186 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005187{
5188 const Py_ssize_t max_char_size = 4;
5189 Py_ssize_t len = wcslen(text);
5190
5191 assert(len >= 0);
5192
Victor Stinner3d4226a2018-08-29 22:21:32 +02005193 int surrogateescape = 0;
5194 int surrogatepass = 0;
5195 switch (errors)
5196 {
5197 case _Py_ERROR_STRICT:
5198 break;
5199 case _Py_ERROR_SURROGATEESCAPE:
5200 surrogateescape = 1;
5201 break;
5202 case _Py_ERROR_SURROGATEPASS:
5203 surrogatepass = 1;
5204 break;
5205 default:
5206 return -3;
5207 }
5208
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005209 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5210 return -1;
5211 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005212 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005213 if (raw_malloc) {
5214 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005215 }
5216 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005217 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005218 }
5219 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005220 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005221 }
5222
5223 char *p = bytes;
5224 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005225 for (i = 0; i < len; ) {
5226 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005227 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005228 i++;
5229#if Py_UNICODE_SIZE == 2
5230 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5231 && i < len
5232 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5233 {
5234 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5235 i++;
5236 }
5237#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005238
5239 if (ch < 0x80) {
5240 /* Encode ASCII */
5241 *p++ = (char) ch;
5242
5243 }
5244 else if (ch < 0x0800) {
5245 /* Encode Latin-1 */
5246 *p++ = (char)(0xc0 | (ch >> 6));
5247 *p++ = (char)(0x80 | (ch & 0x3f));
5248 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005249 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005250 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005251 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005252 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005253 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005254 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005255 if (reason != NULL) {
5256 *reason = "encoding error";
5257 }
5258 if (raw_malloc) {
5259 PyMem_RawFree(bytes);
5260 }
5261 else {
5262 PyMem_Free(bytes);
5263 }
5264 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005265 }
5266 *p++ = (char)(ch & 0xff);
5267 }
5268 else if (ch < 0x10000) {
5269 *p++ = (char)(0xe0 | (ch >> 12));
5270 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5271 *p++ = (char)(0x80 | (ch & 0x3f));
5272 }
5273 else { /* ch >= 0x10000 */
5274 assert(ch <= MAX_UNICODE);
5275 /* Encode UCS4 Unicode ordinals */
5276 *p++ = (char)(0xf0 | (ch >> 18));
5277 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5278 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5279 *p++ = (char)(0x80 | (ch & 0x3f));
5280 }
5281 }
5282 *p++ = '\0';
5283
5284 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005285 char *bytes2;
5286 if (raw_malloc) {
5287 bytes2 = PyMem_RawRealloc(bytes, final_size);
5288 }
5289 else {
5290 bytes2 = PyMem_Realloc(bytes, final_size);
5291 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005292 if (bytes2 == NULL) {
5293 if (error_pos != NULL) {
5294 *error_pos = (size_t)-1;
5295 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005296 if (raw_malloc) {
5297 PyMem_RawFree(bytes);
5298 }
5299 else {
5300 PyMem_Free(bytes);
5301 }
5302 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005303 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005304 *str = bytes2;
5305 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005306}
5307
5308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005309/* Primary internal function which creates utf8 encoded bytes objects.
5310
5311 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005312 and allocate exactly as much space needed at the end. Else allocate the
5313 maximum possible needed (4 result bytes per Unicode character), and return
5314 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005315*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005316static PyObject *
5317unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5318 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319{
Victor Stinner6099a032011-12-18 14:22:26 +01005320 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005321 void *data;
5322 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005324 if (!PyUnicode_Check(unicode)) {
5325 PyErr_BadArgument();
5326 return NULL;
5327 }
5328
5329 if (PyUnicode_READY(unicode) == -1)
5330 return NULL;
5331
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005332 if (PyUnicode_UTF8(unicode))
5333 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5334 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005335
5336 kind = PyUnicode_KIND(unicode);
5337 data = PyUnicode_DATA(unicode);
5338 size = PyUnicode_GET_LENGTH(unicode);
5339
Benjamin Petersonead6b532011-12-20 17:23:42 -06005340 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005341 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005342 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005343 case PyUnicode_1BYTE_KIND:
5344 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5345 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005346 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005347 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005348 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005349 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005350 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005351 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352}
5353
Alexander Belopolsky40018472011-02-26 01:02:56 +00005354PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005355_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5356{
5357 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5358}
5359
5360
5361PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005362PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5363 Py_ssize_t size,
5364 const char *errors)
5365{
5366 PyObject *v, *unicode;
5367
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005368 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005369 if (unicode == NULL)
5370 return NULL;
5371 v = _PyUnicode_AsUTF8String(unicode, errors);
5372 Py_DECREF(unicode);
5373 return v;
5374}
5375
5376PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005377PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005379 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380}
5381
Walter Dörwald41980ca2007-08-16 21:55:45 +00005382/* --- UTF-32 Codec ------------------------------------------------------- */
5383
5384PyObject *
5385PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005386 Py_ssize_t size,
5387 const char *errors,
5388 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005389{
5390 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5391}
5392
5393PyObject *
5394PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005395 Py_ssize_t size,
5396 const char *errors,
5397 int *byteorder,
5398 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005399{
5400 const char *starts = s;
5401 Py_ssize_t startinpos;
5402 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005403 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005404 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005405 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005406 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005407 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005408 PyObject *errorHandler = NULL;
5409 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005410
Walter Dörwald41980ca2007-08-16 21:55:45 +00005411 q = (unsigned char *)s;
5412 e = q + size;
5413
5414 if (byteorder)
5415 bo = *byteorder;
5416
5417 /* Check for BOM marks (U+FEFF) in the input and adjust current
5418 byte order setting accordingly. In native mode, the leading BOM
5419 mark is skipped, in all other modes, it is copied to the output
5420 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005421 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005422 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005423 if (bom == 0x0000FEFF) {
5424 bo = -1;
5425 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005426 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005427 else if (bom == 0xFFFE0000) {
5428 bo = 1;
5429 q += 4;
5430 }
5431 if (byteorder)
5432 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005433 }
5434
Victor Stinnere64322e2012-10-30 23:12:47 +01005435 if (q == e) {
5436 if (consumed)
5437 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005438 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005439 }
5440
Victor Stinnere64322e2012-10-30 23:12:47 +01005441#ifdef WORDS_BIGENDIAN
5442 le = bo < 0;
5443#else
5444 le = bo <= 0;
5445#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005446 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005447
Victor Stinner8f674cc2013-04-17 23:02:17 +02005448 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005449 writer.min_length = (e - q + 3) / 4;
5450 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005451 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005452
Victor Stinnere64322e2012-10-30 23:12:47 +01005453 while (1) {
5454 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005455 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005456
Victor Stinnere64322e2012-10-30 23:12:47 +01005457 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005458 enum PyUnicode_Kind kind = writer.kind;
5459 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005460 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005461 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005462 if (le) {
5463 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005464 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005465 if (ch > maxch)
5466 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005467 if (kind != PyUnicode_1BYTE_KIND &&
5468 Py_UNICODE_IS_SURROGATE(ch))
5469 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005470 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005471 q += 4;
5472 } while (q <= last);
5473 }
5474 else {
5475 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005476 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005477 if (ch > maxch)
5478 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005479 if (kind != PyUnicode_1BYTE_KIND &&
5480 Py_UNICODE_IS_SURROGATE(ch))
5481 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005482 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005483 q += 4;
5484 } while (q <= last);
5485 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005486 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005487 }
5488
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005489 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005490 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005491 startinpos = ((const char *)q) - starts;
5492 endinpos = startinpos + 4;
5493 }
5494 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005495 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005497 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005498 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005499 startinpos = ((const char *)q) - starts;
5500 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005501 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005502 else {
5503 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005504 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005505 goto onError;
5506 q += 4;
5507 continue;
5508 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005509 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005510 startinpos = ((const char *)q) - starts;
5511 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005512 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005513
5514 /* The remaining input chars are ignored if the callback
5515 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005516 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005518 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005520 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005521 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005522 }
5523
Walter Dörwald41980ca2007-08-16 21:55:45 +00005524 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005525 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005526
Walter Dörwald41980ca2007-08-16 21:55:45 +00005527 Py_XDECREF(errorHandler);
5528 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005529 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005530
Benjamin Peterson29060642009-01-31 22:14:21 +00005531 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005532 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005533 Py_XDECREF(errorHandler);
5534 Py_XDECREF(exc);
5535 return NULL;
5536}
5537
5538PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005539_PyUnicode_EncodeUTF32(PyObject *str,
5540 const char *errors,
5541 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005542{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005543 enum PyUnicode_Kind kind;
5544 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005545 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005546 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005547 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005548#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005549 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005550#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005551 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005552#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005553 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005554 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005555 PyObject *errorHandler = NULL;
5556 PyObject *exc = NULL;
5557 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005558
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005559 if (!PyUnicode_Check(str)) {
5560 PyErr_BadArgument();
5561 return NULL;
5562 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005563 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005564 return NULL;
5565 kind = PyUnicode_KIND(str);
5566 data = PyUnicode_DATA(str);
5567 len = PyUnicode_GET_LENGTH(str);
5568
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005569 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005570 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005571 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005572 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005573 if (v == NULL)
5574 return NULL;
5575
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005576 /* output buffer is 4-bytes aligned */
5577 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005578 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005579 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005580 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005581 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005582 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005583
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005584 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005585 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005586 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005587 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005588 else
5589 encoding = "utf-32";
5590
5591 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005592 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5593 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005594 }
5595
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005596 pos = 0;
5597 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005598 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005599
5600 if (kind == PyUnicode_2BYTE_KIND) {
5601 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5602 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005603 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005604 else {
5605 assert(kind == PyUnicode_4BYTE_KIND);
5606 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5607 &out, native_ordering);
5608 }
5609 if (pos == len)
5610 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005611
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005612 rep = unicode_encode_call_errorhandler(
5613 errors, &errorHandler,
5614 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005615 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005616 if (!rep)
5617 goto error;
5618
5619 if (PyBytes_Check(rep)) {
5620 repsize = PyBytes_GET_SIZE(rep);
5621 if (repsize & 3) {
5622 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005623 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005624 "surrogates not allowed");
5625 goto error;
5626 }
5627 moreunits = repsize / 4;
5628 }
5629 else {
5630 assert(PyUnicode_Check(rep));
5631 if (PyUnicode_READY(rep) < 0)
5632 goto error;
5633 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5634 if (!PyUnicode_IS_ASCII(rep)) {
5635 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005636 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005637 "surrogates not allowed");
5638 goto error;
5639 }
5640 }
5641
5642 /* four bytes are reserved for each surrogate */
5643 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005644 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005645 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005646 /* integer overflow */
5647 PyErr_NoMemory();
5648 goto error;
5649 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005650 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005651 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005652 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005653 }
5654
5655 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005656 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005657 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005658 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005659 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005660 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5661 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005662 }
5663
5664 Py_CLEAR(rep);
5665 }
5666
5667 /* Cut back to size actually needed. This is necessary for, for example,
5668 encoding of a string containing isolated surrogates and the 'ignore'
5669 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005670 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005671 if (nsize != PyBytes_GET_SIZE(v))
5672 _PyBytes_Resize(&v, nsize);
5673 Py_XDECREF(errorHandler);
5674 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005675 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005676 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005677 error:
5678 Py_XDECREF(rep);
5679 Py_XDECREF(errorHandler);
5680 Py_XDECREF(exc);
5681 Py_XDECREF(v);
5682 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005683}
5684
Alexander Belopolsky40018472011-02-26 01:02:56 +00005685PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005686PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5687 Py_ssize_t size,
5688 const char *errors,
5689 int byteorder)
5690{
5691 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005692 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005693 if (tmp == NULL)
5694 return NULL;
5695 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5696 Py_DECREF(tmp);
5697 return result;
5698}
5699
5700PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005701PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005702{
Victor Stinnerb960b342011-11-20 19:12:52 +01005703 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005704}
5705
Guido van Rossumd57fd912000-03-10 22:53:23 +00005706/* --- UTF-16 Codec ------------------------------------------------------- */
5707
Tim Peters772747b2001-08-09 22:21:55 +00005708PyObject *
5709PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005710 Py_ssize_t size,
5711 const char *errors,
5712 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005713{
Walter Dörwald69652032004-09-07 20:24:22 +00005714 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5715}
5716
5717PyObject *
5718PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005719 Py_ssize_t size,
5720 const char *errors,
5721 int *byteorder,
5722 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005723{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005725 Py_ssize_t startinpos;
5726 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005727 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005728 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005729 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005730 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005731 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005732 PyObject *errorHandler = NULL;
5733 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005734 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735
Tim Peters772747b2001-08-09 22:21:55 +00005736 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005737 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738
5739 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005740 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005742 /* Check for BOM marks (U+FEFF) in the input and adjust current
5743 byte order setting accordingly. In native mode, the leading BOM
5744 mark is skipped, in all other modes, it is copied to the output
5745 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005746 if (bo == 0 && size >= 2) {
5747 const Py_UCS4 bom = (q[1] << 8) | q[0];
5748 if (bom == 0xFEFF) {
5749 q += 2;
5750 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005751 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005752 else if (bom == 0xFFFE) {
5753 q += 2;
5754 bo = 1;
5755 }
5756 if (byteorder)
5757 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005758 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005759
Antoine Pitrou63065d72012-05-15 23:48:04 +02005760 if (q == e) {
5761 if (consumed)
5762 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005763 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005764 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005765
Christian Heimes743e0cd2012-10-17 23:52:17 +02005766#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005767 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005768 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005769#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005770 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005771 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005772#endif
Tim Peters772747b2001-08-09 22:21:55 +00005773
Antoine Pitrou63065d72012-05-15 23:48:04 +02005774 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005775 character count normally. Error handler will take care of
5776 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005777 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005778 writer.min_length = (e - q + 1) / 2;
5779 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005780 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005781
Antoine Pitrou63065d72012-05-15 23:48:04 +02005782 while (1) {
5783 Py_UCS4 ch = 0;
5784 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005785 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005786 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005787 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005788 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005789 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005790 native_ordering);
5791 else
5792 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005793 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005794 native_ordering);
5795 } else if (kind == PyUnicode_2BYTE_KIND) {
5796 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005797 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005798 native_ordering);
5799 } else {
5800 assert(kind == PyUnicode_4BYTE_KIND);
5801 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005802 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005803 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005804 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005805 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005806
Antoine Pitrou63065d72012-05-15 23:48:04 +02005807 switch (ch)
5808 {
5809 case 0:
5810 /* remaining byte at the end? (size should be even) */
5811 if (q == e || consumed)
5812 goto End;
5813 errmsg = "truncated data";
5814 startinpos = ((const char *)q) - starts;
5815 endinpos = ((const char *)e) - starts;
5816 break;
5817 /* The remaining input chars are ignored if the callback
5818 chooses to skip the input */
5819 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005820 q -= 2;
5821 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005822 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005823 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005824 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005825 endinpos = ((const char *)e) - starts;
5826 break;
5827 case 2:
5828 errmsg = "illegal encoding";
5829 startinpos = ((const char *)q) - 2 - starts;
5830 endinpos = startinpos + 2;
5831 break;
5832 case 3:
5833 errmsg = "illegal UTF-16 surrogate";
5834 startinpos = ((const char *)q) - 4 - starts;
5835 endinpos = startinpos + 2;
5836 break;
5837 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005838 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005839 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005840 continue;
5841 }
5842
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005843 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005844 errors,
5845 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005846 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005847 &starts,
5848 (const char **)&e,
5849 &startinpos,
5850 &endinpos,
5851 &exc,
5852 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005853 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005854 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005855 }
5856
Antoine Pitrou63065d72012-05-15 23:48:04 +02005857End:
Walter Dörwald69652032004-09-07 20:24:22 +00005858 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005860
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005861 Py_XDECREF(errorHandler);
5862 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005863 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864
Benjamin Peterson29060642009-01-31 22:14:21 +00005865 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005866 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005867 Py_XDECREF(errorHandler);
5868 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869 return NULL;
5870}
5871
Tim Peters772747b2001-08-09 22:21:55 +00005872PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005873_PyUnicode_EncodeUTF16(PyObject *str,
5874 const char *errors,
5875 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005877 enum PyUnicode_Kind kind;
5878 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005879 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005880 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005881 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005882 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005883#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005884 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005885#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005886 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005887#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005888 const char *encoding;
5889 Py_ssize_t nsize, pos;
5890 PyObject *errorHandler = NULL;
5891 PyObject *exc = NULL;
5892 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005893
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005894 if (!PyUnicode_Check(str)) {
5895 PyErr_BadArgument();
5896 return NULL;
5897 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005898 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005899 return NULL;
5900 kind = PyUnicode_KIND(str);
5901 data = PyUnicode_DATA(str);
5902 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005903
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005904 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005905 if (kind == PyUnicode_4BYTE_KIND) {
5906 const Py_UCS4 *in = (const Py_UCS4 *)data;
5907 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005908 while (in < end) {
5909 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005910 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005911 }
5912 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005913 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005914 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005915 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005916 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005917 nsize = len + pairs + (byteorder == 0);
5918 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005919 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005921 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005923 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005924 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005925 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005926 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005927 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005928 }
5929 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005930 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005931 }
Tim Peters772747b2001-08-09 22:21:55 +00005932
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005933 if (kind == PyUnicode_1BYTE_KIND) {
5934 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5935 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005936 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005937
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005938 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005939 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005940 }
5941 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005942 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005943 }
5944 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005945 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005946 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005947
5948 pos = 0;
5949 while (pos < len) {
5950 Py_ssize_t repsize, moreunits;
5951
5952 if (kind == PyUnicode_2BYTE_KIND) {
5953 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5954 &out, native_ordering);
5955 }
5956 else {
5957 assert(kind == PyUnicode_4BYTE_KIND);
5958 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5959 &out, native_ordering);
5960 }
5961 if (pos == len)
5962 break;
5963
5964 rep = unicode_encode_call_errorhandler(
5965 errors, &errorHandler,
5966 encoding, "surrogates not allowed",
5967 str, &exc, pos, pos + 1, &pos);
5968 if (!rep)
5969 goto error;
5970
5971 if (PyBytes_Check(rep)) {
5972 repsize = PyBytes_GET_SIZE(rep);
5973 if (repsize & 1) {
5974 raise_encode_exception(&exc, encoding,
5975 str, pos - 1, pos,
5976 "surrogates not allowed");
5977 goto error;
5978 }
5979 moreunits = repsize / 2;
5980 }
5981 else {
5982 assert(PyUnicode_Check(rep));
5983 if (PyUnicode_READY(rep) < 0)
5984 goto error;
5985 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5986 if (!PyUnicode_IS_ASCII(rep)) {
5987 raise_encode_exception(&exc, encoding,
5988 str, pos - 1, pos,
5989 "surrogates not allowed");
5990 goto error;
5991 }
5992 }
5993
5994 /* two bytes are reserved for each surrogate */
5995 if (moreunits > 1) {
5996 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005997 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005998 /* integer overflow */
5999 PyErr_NoMemory();
6000 goto error;
6001 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006002 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006003 goto error;
6004 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6005 }
6006
6007 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006008 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006009 out += moreunits;
6010 } else /* rep is unicode */ {
6011 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6012 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6013 &out, native_ordering);
6014 }
6015
6016 Py_CLEAR(rep);
6017 }
6018
6019 /* Cut back to size actually needed. This is necessary for, for example,
6020 encoding of a string containing isolated surrogates and the 'ignore' handler
6021 is used. */
6022 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6023 if (nsize != PyBytes_GET_SIZE(v))
6024 _PyBytes_Resize(&v, nsize);
6025 Py_XDECREF(errorHandler);
6026 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006027 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006028 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006029 error:
6030 Py_XDECREF(rep);
6031 Py_XDECREF(errorHandler);
6032 Py_XDECREF(exc);
6033 Py_XDECREF(v);
6034 return NULL;
6035#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036}
6037
Alexander Belopolsky40018472011-02-26 01:02:56 +00006038PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006039PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6040 Py_ssize_t size,
6041 const char *errors,
6042 int byteorder)
6043{
6044 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006045 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006046 if (tmp == NULL)
6047 return NULL;
6048 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6049 Py_DECREF(tmp);
6050 return result;
6051}
6052
6053PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006054PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006056 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057}
6058
6059/* --- Unicode Escape Codec ----------------------------------------------- */
6060
Fredrik Lundh06d12682001-01-24 07:59:11 +00006061static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006062
Alexander Belopolsky40018472011-02-26 01:02:56 +00006063PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006064_PyUnicode_DecodeUnicodeEscape(const char *s,
6065 Py_ssize_t size,
6066 const char *errors,
6067 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006069 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006070 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006072 PyObject *errorHandler = NULL;
6073 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006074
Eric V. Smith42454af2016-10-31 09:22:08 -04006075 // so we can remember if we've seen an invalid escape char or not
6076 *first_invalid_escape = NULL;
6077
Victor Stinner62ec3312016-09-06 17:04:34 -07006078 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006079 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006080 }
6081 /* Escaped strings will always be longer than the resulting
6082 Unicode string, so we start with size here and then reduce the
6083 length after conversion to the true value.
6084 (but if the error callback returns a long replacement string
6085 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006086 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006087 writer.min_length = size;
6088 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6089 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006090 }
6091
Guido van Rossumd57fd912000-03-10 22:53:23 +00006092 end = s + size;
6093 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006094 unsigned char c = (unsigned char) *s++;
6095 Py_UCS4 ch;
6096 int count;
6097 Py_ssize_t startinpos;
6098 Py_ssize_t endinpos;
6099 const char *message;
6100
6101#define WRITE_ASCII_CHAR(ch) \
6102 do { \
6103 assert(ch <= 127); \
6104 assert(writer.pos < writer.size); \
6105 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6106 } while(0)
6107
6108#define WRITE_CHAR(ch) \
6109 do { \
6110 if (ch <= writer.maxchar) { \
6111 assert(writer.pos < writer.size); \
6112 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6113 } \
6114 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6115 goto onError; \
6116 } \
6117 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118
6119 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006120 if (c != '\\') {
6121 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006122 continue;
6123 }
6124
Victor Stinner62ec3312016-09-06 17:04:34 -07006125 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006126 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006127 if (s >= end) {
6128 message = "\\ at end of string";
6129 goto error;
6130 }
6131 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006132
Victor Stinner62ec3312016-09-06 17:04:34 -07006133 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006134 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006135
Benjamin Peterson29060642009-01-31 22:14:21 +00006136 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006137 case '\n': continue;
6138 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6139 case '\'': WRITE_ASCII_CHAR('\''); continue;
6140 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6141 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006142 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006143 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6144 case 't': WRITE_ASCII_CHAR('\t'); continue;
6145 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6146 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006147 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006148 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006149 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006150 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151
Benjamin Peterson29060642009-01-31 22:14:21 +00006152 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 case '0': case '1': case '2': case '3':
6154 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006155 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006156 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006157 ch = (ch<<3) + *s++ - '0';
6158 if (s < end && '0' <= *s && *s <= '7') {
6159 ch = (ch<<3) + *s++ - '0';
6160 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006162 WRITE_CHAR(ch);
6163 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006164
Benjamin Peterson29060642009-01-31 22:14:21 +00006165 /* hex escapes */
6166 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006168 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006169 message = "truncated \\xXX escape";
6170 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006174 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006175 message = "truncated \\uXXXX escape";
6176 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006177
Benjamin Peterson29060642009-01-31 22:14:21 +00006178 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006179 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006180 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006181 message = "truncated \\UXXXXXXXX escape";
6182 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006183 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006184 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006185 ch <<= 4;
6186 if (c >= '0' && c <= '9') {
6187 ch += c - '0';
6188 }
6189 else if (c >= 'a' && c <= 'f') {
6190 ch += c - ('a' - 10);
6191 }
6192 else if (c >= 'A' && c <= 'F') {
6193 ch += c - ('A' - 10);
6194 }
6195 else {
6196 break;
6197 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006198 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006199 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006200 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006201 }
6202
6203 /* when we get here, ch is a 32-bit unicode character */
6204 if (ch > MAX_UNICODE) {
6205 message = "illegal Unicode character";
6206 goto error;
6207 }
6208
6209 WRITE_CHAR(ch);
6210 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006211
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006213 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006214 if (ucnhash_CAPI == NULL) {
6215 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006216 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6217 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006218 if (ucnhash_CAPI == NULL) {
6219 PyErr_SetString(
6220 PyExc_UnicodeError,
6221 "\\N escapes not supported (can't load unicodedata module)"
6222 );
6223 goto onError;
6224 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006225 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006226
6227 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006228 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006229 const char *start = ++s;
6230 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006231 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006232 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006233 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006234 namelen = s - start;
6235 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006236 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006237 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006238 ch = 0xffffffff; /* in case 'getcode' messes up */
6239 if (namelen <= INT_MAX &&
6240 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6241 &ch, 0)) {
6242 assert(ch <= MAX_UNICODE);
6243 WRITE_CHAR(ch);
6244 continue;
6245 }
6246 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006247 }
6248 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006249 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006250
6251 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006252 if (*first_invalid_escape == NULL) {
6253 *first_invalid_escape = s-1; /* Back up one char, since we've
6254 already incremented s. */
6255 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006256 WRITE_ASCII_CHAR('\\');
6257 WRITE_CHAR(c);
6258 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006259 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006260
6261 error:
6262 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006263 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006264 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006265 errors, &errorHandler,
6266 "unicodeescape", message,
6267 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006268 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006269 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006270 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006271 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006272
6273#undef WRITE_ASCII_CHAR
6274#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006275 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006276
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006277 Py_XDECREF(errorHandler);
6278 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006279 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006280
Benjamin Peterson29060642009-01-31 22:14:21 +00006281 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006282 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006283 Py_XDECREF(errorHandler);
6284 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285 return NULL;
6286}
6287
Eric V. Smith42454af2016-10-31 09:22:08 -04006288PyObject *
6289PyUnicode_DecodeUnicodeEscape(const char *s,
6290 Py_ssize_t size,
6291 const char *errors)
6292{
6293 const char *first_invalid_escape;
6294 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6295 &first_invalid_escape);
6296 if (result == NULL)
6297 return NULL;
6298 if (first_invalid_escape != NULL) {
6299 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6300 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006301 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006302 Py_DECREF(result);
6303 return NULL;
6304 }
6305 }
6306 return result;
6307}
6308
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006309/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006310
Alexander Belopolsky40018472011-02-26 01:02:56 +00006311PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006312PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006314 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006315 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006317 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006318 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006319 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320
Ezio Melottie7f90372012-10-05 03:33:31 +03006321 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006322 escape.
6323
Ezio Melottie7f90372012-10-05 03:33:31 +03006324 For UCS1 strings it's '\xxx', 4 bytes per source character.
6325 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6326 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006327 */
6328
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006329 if (!PyUnicode_Check(unicode)) {
6330 PyErr_BadArgument();
6331 return NULL;
6332 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006333 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006334 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006335 }
Victor Stinner358af132015-10-12 22:36:57 +02006336
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006337 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006338 if (len == 0) {
6339 return PyBytes_FromStringAndSize(NULL, 0);
6340 }
6341
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006342 kind = PyUnicode_KIND(unicode);
6343 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006344 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6345 bytes, and 1 byte characters 4. */
6346 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006347 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006348 return PyErr_NoMemory();
6349 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006350 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006351 if (repr == NULL) {
6352 return NULL;
6353 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006354
Victor Stinner62ec3312016-09-06 17:04:34 -07006355 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006356 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006357 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006358
Victor Stinner62ec3312016-09-06 17:04:34 -07006359 /* U+0000-U+00ff range */
6360 if (ch < 0x100) {
6361 if (ch >= ' ' && ch < 127) {
6362 if (ch != '\\') {
6363 /* Copy printable US ASCII as-is */
6364 *p++ = (char) ch;
6365 }
6366 /* Escape backslashes */
6367 else {
6368 *p++ = '\\';
6369 *p++ = '\\';
6370 }
6371 }
Victor Stinner358af132015-10-12 22:36:57 +02006372
Victor Stinner62ec3312016-09-06 17:04:34 -07006373 /* Map special whitespace to '\t', \n', '\r' */
6374 else if (ch == '\t') {
6375 *p++ = '\\';
6376 *p++ = 't';
6377 }
6378 else if (ch == '\n') {
6379 *p++ = '\\';
6380 *p++ = 'n';
6381 }
6382 else if (ch == '\r') {
6383 *p++ = '\\';
6384 *p++ = 'r';
6385 }
6386
6387 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6388 else {
6389 *p++ = '\\';
6390 *p++ = 'x';
6391 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6392 *p++ = Py_hexdigits[ch & 0x000F];
6393 }
Tim Petersced69f82003-09-16 20:30:58 +00006394 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006395 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006396 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397 *p++ = '\\';
6398 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006399 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6400 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6401 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6402 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006403 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006404 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6405 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006406
Victor Stinner62ec3312016-09-06 17:04:34 -07006407 /* Make sure that the first two digits are zero */
6408 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006409 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 *p++ = 'U';
6411 *p++ = '0';
6412 *p++ = '0';
6413 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6414 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6415 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6416 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6417 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6418 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006419 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421
Victor Stinner62ec3312016-09-06 17:04:34 -07006422 assert(p - PyBytes_AS_STRING(repr) > 0);
6423 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6424 return NULL;
6425 }
6426 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006427}
6428
Alexander Belopolsky40018472011-02-26 01:02:56 +00006429PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006430PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6431 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006432{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006433 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006434 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006435 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006437 }
6438
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006439 result = PyUnicode_AsUnicodeEscapeString(tmp);
6440 Py_DECREF(tmp);
6441 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442}
6443
6444/* --- Raw Unicode Escape Codec ------------------------------------------- */
6445
Alexander Belopolsky40018472011-02-26 01:02:56 +00006446PyObject *
6447PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006448 Py_ssize_t size,
6449 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006450{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006452 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006454 PyObject *errorHandler = NULL;
6455 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006456
Victor Stinner62ec3312016-09-06 17:04:34 -07006457 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006458 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006459 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006460
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 /* Escaped strings will always be longer than the resulting
6462 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006463 length after conversion to the true value. (But decoding error
6464 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006465 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006466 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006467 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6468 goto onError;
6469 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006470
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 end = s + size;
6472 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006473 unsigned char c = (unsigned char) *s++;
6474 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006475 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006476 Py_ssize_t startinpos;
6477 Py_ssize_t endinpos;
6478 const char *message;
6479
6480#define WRITE_CHAR(ch) \
6481 do { \
6482 if (ch <= writer.maxchar) { \
6483 assert(writer.pos < writer.size); \
6484 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6485 } \
6486 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6487 goto onError; \
6488 } \
6489 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490
Benjamin Peterson29060642009-01-31 22:14:21 +00006491 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006492 if (c != '\\' || s >= end) {
6493 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006495 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006496
Victor Stinner62ec3312016-09-06 17:04:34 -07006497 c = (unsigned char) *s++;
6498 if (c == 'u') {
6499 count = 4;
6500 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006502 else if (c == 'U') {
6503 count = 8;
6504 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006505 }
6506 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006507 assert(writer.pos < writer.size);
6508 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6509 WRITE_CHAR(c);
6510 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006511 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006512 startinpos = s - starts - 2;
6513
6514 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6515 for (ch = 0; count && s < end; ++s, --count) {
6516 c = (unsigned char)*s;
6517 ch <<= 4;
6518 if (c >= '0' && c <= '9') {
6519 ch += c - '0';
6520 }
6521 else if (c >= 'a' && c <= 'f') {
6522 ch += c - ('a' - 10);
6523 }
6524 else if (c >= 'A' && c <= 'F') {
6525 ch += c - ('A' - 10);
6526 }
6527 else {
6528 break;
6529 }
6530 }
6531 if (!count) {
6532 if (ch <= MAX_UNICODE) {
6533 WRITE_CHAR(ch);
6534 continue;
6535 }
6536 message = "\\Uxxxxxxxx out of range";
6537 }
6538
6539 endinpos = s-starts;
6540 writer.min_length = end - s + writer.pos;
6541 if (unicode_decode_call_errorhandler_writer(
6542 errors, &errorHandler,
6543 "rawunicodeescape", message,
6544 &starts, &end, &startinpos, &endinpos, &exc, &s,
6545 &writer)) {
6546 goto onError;
6547 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006548 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006549
6550#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006551 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006552 Py_XDECREF(errorHandler);
6553 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006554 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006555
Benjamin Peterson29060642009-01-31 22:14:21 +00006556 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006557 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006558 Py_XDECREF(errorHandler);
6559 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006561
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562}
6563
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006564
Alexander Belopolsky40018472011-02-26 01:02:56 +00006565PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006566PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567{
Victor Stinner62ec3312016-09-06 17:04:34 -07006568 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006569 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006570 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006571 int kind;
6572 void *data;
6573 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006575 if (!PyUnicode_Check(unicode)) {
6576 PyErr_BadArgument();
6577 return NULL;
6578 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006579 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006580 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006581 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006582 kind = PyUnicode_KIND(unicode);
6583 data = PyUnicode_DATA(unicode);
6584 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006585 if (kind == PyUnicode_1BYTE_KIND) {
6586 return PyBytes_FromStringAndSize(data, len);
6587 }
Victor Stinner0e368262011-11-10 20:12:49 +01006588
Victor Stinner62ec3312016-09-06 17:04:34 -07006589 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6590 bytes, and 1 byte characters 4. */
6591 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006592
Victor Stinner62ec3312016-09-06 17:04:34 -07006593 if (len > PY_SSIZE_T_MAX / expandsize) {
6594 return PyErr_NoMemory();
6595 }
6596 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6597 if (repr == NULL) {
6598 return NULL;
6599 }
6600 if (len == 0) {
6601 return repr;
6602 }
6603
6604 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006605 for (pos = 0; pos < len; pos++) {
6606 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006607
Victor Stinner62ec3312016-09-06 17:04:34 -07006608 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6609 if (ch < 0x100) {
6610 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006611 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006612 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006613 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 *p++ = '\\';
6615 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006616 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6617 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6618 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6619 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006621 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6622 else {
6623 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6624 *p++ = '\\';
6625 *p++ = 'U';
6626 *p++ = '0';
6627 *p++ = '0';
6628 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6629 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6630 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6631 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6632 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6633 *p++ = Py_hexdigits[ch & 15];
6634 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006636
Victor Stinner62ec3312016-09-06 17:04:34 -07006637 assert(p > PyBytes_AS_STRING(repr));
6638 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6639 return NULL;
6640 }
6641 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642}
6643
Alexander Belopolsky40018472011-02-26 01:02:56 +00006644PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006645PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6646 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006648 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006649 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006650 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006651 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006652 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6653 Py_DECREF(tmp);
6654 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655}
6656
6657/* --- Latin-1 Codec ------------------------------------------------------ */
6658
Alexander Belopolsky40018472011-02-26 01:02:56 +00006659PyObject *
6660PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006661 Py_ssize_t size,
6662 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006663{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006665 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006666}
6667
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006669static void
6670make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006671 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006672 PyObject *unicode,
6673 Py_ssize_t startpos, Py_ssize_t endpos,
6674 const char *reason)
6675{
6676 if (*exceptionObject == NULL) {
6677 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006678 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006679 encoding, unicode, startpos, endpos, reason);
6680 }
6681 else {
6682 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6683 goto onError;
6684 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6685 goto onError;
6686 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6687 goto onError;
6688 return;
6689 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006690 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006691 }
6692}
6693
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006694/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006695static void
6696raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006697 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006698 PyObject *unicode,
6699 Py_ssize_t startpos, Py_ssize_t endpos,
6700 const char *reason)
6701{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006702 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006703 encoding, unicode, startpos, endpos, reason);
6704 if (*exceptionObject != NULL)
6705 PyCodec_StrictErrors(*exceptionObject);
6706}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707
6708/* error handling callback helper:
6709 build arguments, call the callback and check the arguments,
6710 put the result into newpos and return the replacement string, which
6711 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006712static PyObject *
6713unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006714 PyObject **errorHandler,
6715 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006716 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006717 Py_ssize_t startpos, Py_ssize_t endpos,
6718 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006719{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006720 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006721 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006722 PyObject *restuple;
6723 PyObject *resunicode;
6724
6725 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006727 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006728 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006729 }
6730
Benjamin Petersonbac79492012-01-14 13:34:47 -05006731 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006732 return NULL;
6733 len = PyUnicode_GET_LENGTH(unicode);
6734
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006735 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006736 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006737 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006739
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006740 restuple = PyObject_CallFunctionObjArgs(
6741 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006743 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006744 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006745 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006746 Py_DECREF(restuple);
6747 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006748 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006749 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006750 &resunicode, newpos)) {
6751 Py_DECREF(restuple);
6752 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006753 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006754 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6755 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6756 Py_DECREF(restuple);
6757 return NULL;
6758 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006759 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006760 *newpos = len + *newpos;
6761 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006762 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006763 Py_DECREF(restuple);
6764 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006765 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006766 Py_INCREF(resunicode);
6767 Py_DECREF(restuple);
6768 return resunicode;
6769}
6770
Alexander Belopolsky40018472011-02-26 01:02:56 +00006771static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006772unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006773 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006774 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006775{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006776 /* input state */
6777 Py_ssize_t pos=0, size;
6778 int kind;
6779 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006780 /* pointer into the output */
6781 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006782 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6783 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006784 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006785 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006786 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006787 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006788 /* output object */
6789 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006790
Benjamin Petersonbac79492012-01-14 13:34:47 -05006791 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006792 return NULL;
6793 size = PyUnicode_GET_LENGTH(unicode);
6794 kind = PyUnicode_KIND(unicode);
6795 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006796 /* allocate enough for a simple encoding without
6797 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006798 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006799 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006800
6801 _PyBytesWriter_Init(&writer);
6802 str = _PyBytesWriter_Alloc(&writer, size);
6803 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006804 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006805
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006806 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006807 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006808
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006810 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006812 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006813 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006814 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006816 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006817 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006818 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006819 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006821
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006822 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006824
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006825 /* Only overallocate the buffer if it's not the last write */
6826 writer.overallocate = (collend < size);
6827
Benjamin Peterson29060642009-01-31 22:14:21 +00006828 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006829 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006830 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006831
6832 switch (error_handler) {
6833 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006834 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006836
6837 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006838 memset(str, '?', collend - collstart);
6839 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006840 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006841 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006842 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006843 break;
Victor Stinner50149202015-09-22 00:26:54 +02006844
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006845 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006846 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006847 writer.min_size -= (collend - collstart);
6848 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006849 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006850 if (str == NULL)
6851 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006852 pos = collend;
6853 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006854
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006855 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006856 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006857 writer.min_size -= (collend - collstart);
6858 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006859 unicode, collstart, collend);
6860 if (str == NULL)
6861 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006862 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 break;
Victor Stinner50149202015-09-22 00:26:54 +02006864
Victor Stinnerc3713e92015-09-29 12:32:13 +02006865 case _Py_ERROR_SURROGATEESCAPE:
6866 for (i = collstart; i < collend; ++i) {
6867 ch = PyUnicode_READ(kind, data, i);
6868 if (ch < 0xdc80 || 0xdcff < ch) {
6869 /* Not a UTF-8b surrogate */
6870 break;
6871 }
6872 *str++ = (char)(ch - 0xdc00);
6873 ++pos;
6874 }
6875 if (i >= collend)
6876 break;
6877 collstart = pos;
6878 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006879 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006880
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006882 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6883 encoding, reason, unicode, &exc,
6884 collstart, collend, &newpos);
6885 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006886 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006887
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006888 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006889 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006890
Victor Stinner6bd525b2015-10-09 13:10:05 +02006891 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006892 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006893 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006894 PyBytes_AS_STRING(rep),
6895 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006896 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006897 else {
6898 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006899
Victor Stinner6bd525b2015-10-09 13:10:05 +02006900 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006901 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006902
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006903 if (limit == 256 ?
6904 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6905 !PyUnicode_IS_ASCII(rep))
6906 {
6907 /* Not all characters are smaller than limit */
6908 raise_encode_exception(&exc, encoding, unicode,
6909 collstart, collend, reason);
6910 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006912 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6913 str = _PyBytesWriter_WriteBytes(&writer, str,
6914 PyUnicode_DATA(rep),
6915 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006916 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006917 if (str == NULL)
6918 goto onError;
6919
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006920 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006921 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006922 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006923
6924 /* If overallocation was disabled, ensure that it was the last
6925 write. Otherwise, we missed an optimization */
6926 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006927 }
6928 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006929
Victor Stinner50149202015-09-22 00:26:54 +02006930 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006931 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006932 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006933
6934 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006935 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006936 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006937 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006938 Py_XDECREF(exc);
6939 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006940}
6941
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006942/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006943PyObject *
6944PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006945 Py_ssize_t size,
6946 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006948 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006949 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006950 if (unicode == NULL)
6951 return NULL;
6952 result = unicode_encode_ucs1(unicode, errors, 256);
6953 Py_DECREF(unicode);
6954 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955}
6956
Alexander Belopolsky40018472011-02-26 01:02:56 +00006957PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006958_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959{
6960 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006961 PyErr_BadArgument();
6962 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006964 if (PyUnicode_READY(unicode) == -1)
6965 return NULL;
6966 /* Fast path: if it is a one-byte string, construct
6967 bytes object directly. */
6968 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6969 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6970 PyUnicode_GET_LENGTH(unicode));
6971 /* Non-Latin-1 characters present. Defer to above function to
6972 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006973 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006974}
6975
6976PyObject*
6977PyUnicode_AsLatin1String(PyObject *unicode)
6978{
6979 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980}
6981
6982/* --- 7-bit ASCII Codec -------------------------------------------------- */
6983
Alexander Belopolsky40018472011-02-26 01:02:56 +00006984PyObject *
6985PyUnicode_DecodeASCII(const char *s,
6986 Py_ssize_t size,
6987 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006988{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006989 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09006990 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006991 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006992 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006993 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006994
Guido van Rossumd57fd912000-03-10 22:53:23 +00006995 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006996 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006997
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006999 if (size == 1 && (unsigned char)s[0] < 128)
7000 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007001
Inada Naoki770847a2019-06-24 12:30:24 +09007002 // Shortcut for simple case
7003 PyObject *u = PyUnicode_New(size, 127);
7004 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007005 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007006 }
7007 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7008 if (outpos == size) {
7009 return u;
7010 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007011
Inada Naoki770847a2019-06-24 12:30:24 +09007012 _PyUnicodeWriter writer;
7013 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007014 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007015
Inada Naoki770847a2019-06-24 12:30:24 +09007016 s += outpos;
7017 int kind = writer.kind;
7018 void *data = writer.data;
7019 Py_ssize_t startinpos, endinpos;
7020
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007021 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007022 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007023 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007024 PyUnicode_WRITE(kind, data, writer.pos, c);
7025 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007026 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007027 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007028 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007029
7030 /* byte outsize range 0x00..0x7f: call the error handler */
7031
7032 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007033 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007034
7035 switch (error_handler)
7036 {
7037 case _Py_ERROR_REPLACE:
7038 case _Py_ERROR_SURROGATEESCAPE:
7039 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007040 but we may switch to UCS2 at the first write */
7041 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7042 goto onError;
7043 kind = writer.kind;
7044 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007045
7046 if (error_handler == _Py_ERROR_REPLACE)
7047 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7048 else
7049 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7050 writer.pos++;
7051 ++s;
7052 break;
7053
7054 case _Py_ERROR_IGNORE:
7055 ++s;
7056 break;
7057
7058 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 startinpos = s-starts;
7060 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007061 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007062 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007063 "ascii", "ordinal not in range(128)",
7064 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007065 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007066 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007067 kind = writer.kind;
7068 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007071 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007072 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007073 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007074
Benjamin Peterson29060642009-01-31 22:14:21 +00007075 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007076 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007077 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007078 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079 return NULL;
7080}
7081
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007082/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007083PyObject *
7084PyUnicode_EncodeASCII(const Py_UNICODE *p,
7085 Py_ssize_t size,
7086 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007088 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007089 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007090 if (unicode == NULL)
7091 return NULL;
7092 result = unicode_encode_ucs1(unicode, errors, 128);
7093 Py_DECREF(unicode);
7094 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095}
7096
Alexander Belopolsky40018472011-02-26 01:02:56 +00007097PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007098_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099{
7100 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007101 PyErr_BadArgument();
7102 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007104 if (PyUnicode_READY(unicode) == -1)
7105 return NULL;
7106 /* Fast path: if it is an ASCII-only string, construct bytes object
7107 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007108 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007109 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7110 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007111 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007112}
7113
7114PyObject *
7115PyUnicode_AsASCIIString(PyObject *unicode)
7116{
7117 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007118}
7119
Steve Dowercc16be82016-09-08 10:35:16 -07007120#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007121
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007122/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007123
/* The code-page helpers below take `int` lengths.  NEED_RETRY is defined
   when size_t is wider than int; decode_code_page_stateful() then splits
   inputs longer than INT_MAX into several chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Provide the flag value when the included headers do not define it
   (presumably older Windows SDK headers -- TODO confirm). */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7131
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007132static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007133code_page_name(UINT code_page, PyObject **obj)
7134{
7135 *obj = NULL;
7136 if (code_page == CP_ACP)
7137 return "mbcs";
7138 if (code_page == CP_UTF7)
7139 return "CP_UTF7";
7140 if (code_page == CP_UTF8)
7141 return "CP_UTF8";
7142
7143 *obj = PyBytes_FromFormat("cp%u", code_page);
7144 if (*obj == NULL)
7145 return NULL;
7146 return PyBytes_AS_STRING(*obj);
7147}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007148
Victor Stinner3a50e702011-10-18 21:21:00 +02007149static DWORD
7150decode_code_page_flags(UINT code_page)
7151{
7152 if (code_page == CP_UTF7) {
7153 /* The CP_UTF7 decoder only supports flags=0 */
7154 return 0;
7155 }
7156 else
7157 return MB_ERR_INVALID_CHARS;
7158}
7159
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007160/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007161 * Decode a byte string from a Windows code page into unicode object in strict
7162 * mode.
7163 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007164 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7165 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007166 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007167static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007168decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007169 wchar_t **buf,
7170 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007171 const char *in,
7172 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007173{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007174 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007175 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007177
7178 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007179 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007180 while ((outsize = MultiByteToWideChar(code_page, flags,
7181 in, insize, NULL, 0)) <= 0)
7182 {
7183 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7184 goto error;
7185 }
7186 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7187 flags = 0;
7188 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007189
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007190 /* Extend a wchar_t* buffer */
7191 Py_ssize_t n = *bufsize; /* Get the current length */
7192 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7193 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007194 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007195 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007196
7197 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007198 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7199 if (outsize <= 0)
7200 goto error;
7201 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007202
Victor Stinner3a50e702011-10-18 21:21:00 +02007203error:
7204 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7205 return -2;
7206 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007207 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007208}
7209
Victor Stinner3a50e702011-10-18 21:21:00 +02007210/*
7211 * Decode a byte string from a code page into unicode object with an error
7212 * handler.
7213 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007214 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007215 * UnicodeDecodeError exception and returns -1 on error.
7216 */
7217static int
7218decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007219 wchar_t **buf,
7220 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007221 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007222 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007223{
7224 const char *startin = in;
7225 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007226 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007227 /* Ideally, we should get reason from FormatMessage. This is the Windows
7228 2000 English version of the message. */
7229 const char *reason = "No mapping for the Unicode character exists "
7230 "in the target code page.";
7231 /* each step cannot decode more than 1 character, but a character can be
7232 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007233 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007234 int insize;
7235 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007236 PyObject *errorHandler = NULL;
7237 PyObject *exc = NULL;
7238 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007239 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007240 DWORD err;
7241 int ret = -1;
7242
7243 assert(size > 0);
7244
7245 encoding = code_page_name(code_page, &encoding_obj);
7246 if (encoding == NULL)
7247 return -1;
7248
Victor Stinner7d00cc12014-03-17 23:08:06 +01007249 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007250 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7251 UnicodeDecodeError. */
7252 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7253 if (exc != NULL) {
7254 PyCodec_StrictErrors(exc);
7255 Py_CLEAR(exc);
7256 }
7257 goto error;
7258 }
7259
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007260 /* Extend a wchar_t* buffer */
7261 Py_ssize_t n = *bufsize; /* Get the current length */
7262 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7263 PyErr_NoMemory();
7264 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007266 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7267 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007268 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007269 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007270
7271 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007272 while (in < endin)
7273 {
7274 /* Decode a character */
7275 insize = 1;
7276 do
7277 {
7278 outsize = MultiByteToWideChar(code_page, flags,
7279 in, insize,
7280 buffer, Py_ARRAY_LENGTH(buffer));
7281 if (outsize > 0)
7282 break;
7283 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007284 if (err == ERROR_INVALID_FLAGS && flags) {
7285 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7286 flags = 0;
7287 continue;
7288 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007289 if (err != ERROR_NO_UNICODE_TRANSLATION
7290 && err != ERROR_INSUFFICIENT_BUFFER)
7291 {
7292 PyErr_SetFromWindowsErr(0);
7293 goto error;
7294 }
7295 insize++;
7296 }
7297 /* 4=maximum length of a UTF-8 sequence */
7298 while (insize <= 4 && (in + insize) <= endin);
7299
7300 if (outsize <= 0) {
7301 Py_ssize_t startinpos, endinpos, outpos;
7302
Victor Stinner7d00cc12014-03-17 23:08:06 +01007303 /* last character in partial decode? */
7304 if (in + insize >= endin && !final)
7305 break;
7306
Victor Stinner3a50e702011-10-18 21:21:00 +02007307 startinpos = in - startin;
7308 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007309 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007310 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007311 errors, &errorHandler,
7312 encoding, reason,
7313 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007314 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007315 {
7316 goto error;
7317 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007318 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007319 }
7320 else {
7321 in += insize;
7322 memcpy(out, buffer, outsize * sizeof(wchar_t));
7323 out += outsize;
7324 }
7325 }
7326
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007327 /* Shrink the buffer */
7328 assert(out - *buf <= *bufsize);
7329 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007330 /* (in - startin) <= size and size is an int */
7331 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007332
7333error:
7334 Py_XDECREF(encoding_obj);
7335 Py_XDECREF(errorHandler);
7336 Py_XDECREF(exc);
7337 return ret;
7338}
7339
Victor Stinner3a50e702011-10-18 21:21:00 +02007340static PyObject *
7341decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007342 const char *s, Py_ssize_t size,
7343 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007344{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007345 wchar_t *buf = NULL;
7346 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007347 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007348
Victor Stinner3a50e702011-10-18 21:21:00 +02007349 if (code_page < 0) {
7350 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7351 return NULL;
7352 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007353 if (size < 0) {
7354 PyErr_BadInternalCall();
7355 return NULL;
7356 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007357
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007358 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007359 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007360
Victor Stinner76a31a62011-11-04 00:05:13 +01007361 do
7362 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007363#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007364 if (size > INT_MAX) {
7365 chunk_size = INT_MAX;
7366 final = 0;
7367 done = 0;
7368 }
7369 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007370#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007371 {
7372 chunk_size = (int)size;
7373 final = (consumed == NULL);
7374 done = 1;
7375 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007376
Victor Stinner76a31a62011-11-04 00:05:13 +01007377 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007378 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007379 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007380 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007381 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007382
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007383 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007384 s, chunk_size);
7385 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007386 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007387 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007388 errors, final);
7389 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007390
7391 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007392 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007393 return NULL;
7394 }
7395
7396 if (consumed)
7397 *consumed += converted;
7398
7399 s += converted;
7400 size -= converted;
7401 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007402
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007403 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7404 PyMem_Free(buf);
7405 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007406}
7407
Alexander Belopolsky40018472011-02-26 01:02:56 +00007408PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007409PyUnicode_DecodeCodePageStateful(int code_page,
7410 const char *s,
7411 Py_ssize_t size,
7412 const char *errors,
7413 Py_ssize_t *consumed)
7414{
7415 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7416}
7417
7418PyObject *
7419PyUnicode_DecodeMBCSStateful(const char *s,
7420 Py_ssize_t size,
7421 const char *errors,
7422 Py_ssize_t *consumed)
7423{
7424 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7425}
7426
7427PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007428PyUnicode_DecodeMBCS(const char *s,
7429 Py_ssize_t size,
7430 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007431{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007432 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7433}
7434
Victor Stinner3a50e702011-10-18 21:21:00 +02007435static DWORD
7436encode_code_page_flags(UINT code_page, const char *errors)
7437{
7438 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007439 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007440 }
7441 else if (code_page == CP_UTF7) {
7442 /* CP_UTF7 only supports flags=0 */
7443 return 0;
7444 }
7445 else {
7446 if (errors != NULL && strcmp(errors, "replace") == 0)
7447 return 0;
7448 else
7449 return WC_NO_BEST_FIT_CHARS;
7450 }
7451}
7452
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007453/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007454 * Encode a Unicode string to a Windows code page into a byte string in strict
7455 * mode.
7456 *
7457 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007458 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007459 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007460static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007461encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007462 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007464{
Victor Stinner554f3f02010-06-16 23:33:54 +00007465 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007466 BOOL *pusedDefaultChar = &usedDefaultChar;
7467 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007468 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007469 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007470 const DWORD flags = encode_code_page_flags(code_page, NULL);
7471 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007472 /* Create a substring so that we can get the UTF-16 representation
7473 of just the slice under consideration. */
7474 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007475
Martin v. Löwis3d325192011-11-04 18:23:06 +01007476 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007477
Victor Stinner3a50e702011-10-18 21:21:00 +02007478 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007479 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007480 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007481 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007482
Victor Stinner2fc507f2011-11-04 20:06:39 +01007483 substring = PyUnicode_Substring(unicode, offset, offset+len);
7484 if (substring == NULL)
7485 return -1;
7486 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7487 if (p == NULL) {
7488 Py_DECREF(substring);
7489 return -1;
7490 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007491 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007492
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007493 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007495 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 NULL, 0,
7497 NULL, pusedDefaultChar);
7498 if (outsize <= 0)
7499 goto error;
7500 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007501 if (pusedDefaultChar && *pusedDefaultChar) {
7502 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007503 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007504 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007505
Victor Stinner3a50e702011-10-18 21:21:00 +02007506 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007507 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007508 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007509 if (*outbytes == NULL) {
7510 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007511 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007512 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007514 }
7515 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007516 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007517 const Py_ssize_t n = PyBytes_Size(*outbytes);
7518 if (outsize > PY_SSIZE_T_MAX - n) {
7519 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007520 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007522 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007523 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7524 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007526 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007528 }
7529
7530 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007531 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007532 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007533 out, outsize,
7534 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007535 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007536 if (outsize <= 0)
7537 goto error;
7538 if (pusedDefaultChar && *pusedDefaultChar)
7539 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007540 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007541
Victor Stinner3a50e702011-10-18 21:21:00 +02007542error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007543 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7545 return -2;
7546 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007547 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007548}
7549
Victor Stinner3a50e702011-10-18 21:21:00 +02007550/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007551 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007552 * error handler.
7553 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007554 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007555 * -1 on other error.
7556 */
7557static int
7558encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007559 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007560 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007561{
Victor Stinner3a50e702011-10-18 21:21:00 +02007562 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007563 Py_ssize_t pos = unicode_offset;
7564 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007565 /* Ideally, we should get reason from FormatMessage. This is the Windows
7566 2000 English version of the message. */
7567 const char *reason = "invalid character";
7568 /* 4=maximum length of a UTF-8 sequence */
7569 char buffer[4];
7570 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7571 Py_ssize_t outsize;
7572 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007573 PyObject *errorHandler = NULL;
7574 PyObject *exc = NULL;
7575 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007576 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007577 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 PyObject *rep;
7579 int ret = -1;
7580
7581 assert(insize > 0);
7582
7583 encoding = code_page_name(code_page, &encoding_obj);
7584 if (encoding == NULL)
7585 return -1;
7586
7587 if (errors == NULL || strcmp(errors, "strict") == 0) {
7588 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7589 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007590 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 if (exc != NULL) {
7592 PyCodec_StrictErrors(exc);
7593 Py_DECREF(exc);
7594 }
7595 Py_XDECREF(encoding_obj);
7596 return -1;
7597 }
7598
7599 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7600 pusedDefaultChar = &usedDefaultChar;
7601 else
7602 pusedDefaultChar = NULL;
7603
7604 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7605 PyErr_NoMemory();
7606 goto error;
7607 }
7608 outsize = insize * Py_ARRAY_LENGTH(buffer);
7609
7610 if (*outbytes == NULL) {
7611 /* Create string object */
7612 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7613 if (*outbytes == NULL)
7614 goto error;
7615 out = PyBytes_AS_STRING(*outbytes);
7616 }
7617 else {
7618 /* Extend string object */
7619 Py_ssize_t n = PyBytes_Size(*outbytes);
7620 if (n > PY_SSIZE_T_MAX - outsize) {
7621 PyErr_NoMemory();
7622 goto error;
7623 }
7624 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7625 goto error;
7626 out = PyBytes_AS_STRING(*outbytes) + n;
7627 }
7628
7629 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007630 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007631 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007632 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7633 wchar_t chars[2];
7634 int charsize;
7635 if (ch < 0x10000) {
7636 chars[0] = (wchar_t)ch;
7637 charsize = 1;
7638 }
7639 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007640 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7641 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007642 charsize = 2;
7643 }
7644
Victor Stinner3a50e702011-10-18 21:21:00 +02007645 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007646 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007647 buffer, Py_ARRAY_LENGTH(buffer),
7648 NULL, pusedDefaultChar);
7649 if (outsize > 0) {
7650 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7651 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007652 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007653 memcpy(out, buffer, outsize);
7654 out += outsize;
7655 continue;
7656 }
7657 }
7658 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7659 PyErr_SetFromWindowsErr(0);
7660 goto error;
7661 }
7662
Victor Stinner3a50e702011-10-18 21:21:00 +02007663 rep = unicode_encode_call_errorhandler(
7664 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007665 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007666 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007667 if (rep == NULL)
7668 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007669 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007670
7671 if (PyBytes_Check(rep)) {
7672 outsize = PyBytes_GET_SIZE(rep);
7673 if (outsize != 1) {
7674 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7675 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7676 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7677 Py_DECREF(rep);
7678 goto error;
7679 }
7680 out = PyBytes_AS_STRING(*outbytes) + offset;
7681 }
7682 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7683 out += outsize;
7684 }
7685 else {
7686 Py_ssize_t i;
7687 enum PyUnicode_Kind kind;
7688 void *data;
7689
Benjamin Petersonbac79492012-01-14 13:34:47 -05007690 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007691 Py_DECREF(rep);
7692 goto error;
7693 }
7694
7695 outsize = PyUnicode_GET_LENGTH(rep);
7696 if (outsize != 1) {
7697 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7698 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7699 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7700 Py_DECREF(rep);
7701 goto error;
7702 }
7703 out = PyBytes_AS_STRING(*outbytes) + offset;
7704 }
7705 kind = PyUnicode_KIND(rep);
7706 data = PyUnicode_DATA(rep);
7707 for (i=0; i < outsize; i++) {
7708 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7709 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007710 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007711 encoding, unicode,
7712 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007713 "unable to encode error handler result to ASCII");
7714 Py_DECREF(rep);
7715 goto error;
7716 }
7717 *out = (unsigned char)ch;
7718 out++;
7719 }
7720 }
7721 Py_DECREF(rep);
7722 }
7723 /* write a NUL byte */
7724 *out = 0;
7725 outsize = out - PyBytes_AS_STRING(*outbytes);
7726 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7727 if (_PyBytes_Resize(outbytes, outsize) < 0)
7728 goto error;
7729 ret = 0;
7730
7731error:
7732 Py_XDECREF(encoding_obj);
7733 Py_XDECREF(errorHandler);
7734 Py_XDECREF(exc);
7735 return ret;
7736}
7737
Victor Stinner3a50e702011-10-18 21:21:00 +02007738static PyObject *
7739encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007740 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007741 const char *errors)
7742{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007743 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007744 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007745 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007746 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007747
Victor Stinner29dacf22015-01-26 16:41:32 +01007748 if (!PyUnicode_Check(unicode)) {
7749 PyErr_BadArgument();
7750 return NULL;
7751 }
7752
Benjamin Petersonbac79492012-01-14 13:34:47 -05007753 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007754 return NULL;
7755 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007756
Victor Stinner3a50e702011-10-18 21:21:00 +02007757 if (code_page < 0) {
7758 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7759 return NULL;
7760 }
7761
Martin v. Löwis3d325192011-11-04 18:23:06 +01007762 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007763 return PyBytes_FromStringAndSize(NULL, 0);
7764
Victor Stinner7581cef2011-11-03 22:32:33 +01007765 offset = 0;
7766 do
7767 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007768#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007769 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007770 chunks. */
7771 if (len > INT_MAX/2) {
7772 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007773 done = 0;
7774 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007775 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007776#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007777 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007778 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007779 done = 1;
7780 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007781
Victor Stinner76a31a62011-11-04 00:05:13 +01007782 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007783 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007784 errors);
7785 if (ret == -2)
7786 ret = encode_code_page_errors(code_page, &outbytes,
7787 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007788 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007789 if (ret < 0) {
7790 Py_XDECREF(outbytes);
7791 return NULL;
7792 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007793
Victor Stinner7581cef2011-11-03 22:32:33 +01007794 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007795 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007796 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007797
Victor Stinner3a50e702011-10-18 21:21:00 +02007798 return outbytes;
7799}
7800
7801PyObject *
7802PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7803 Py_ssize_t size,
7804 const char *errors)
7805{
Victor Stinner7581cef2011-11-03 22:32:33 +01007806 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007807 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007808 if (unicode == NULL)
7809 return NULL;
7810 res = encode_code_page(CP_ACP, unicode, errors);
7811 Py_DECREF(unicode);
7812 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007813}
7814
7815PyObject *
7816PyUnicode_EncodeCodePage(int code_page,
7817 PyObject *unicode,
7818 const char *errors)
7819{
Victor Stinner7581cef2011-11-03 22:32:33 +01007820 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007821}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007822
Alexander Belopolsky40018472011-02-26 01:02:56 +00007823PyObject *
7824PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007825{
Victor Stinner7581cef2011-11-03 22:32:33 +01007826 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007827}
7828
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007829#undef NEED_RETRY
7830
Steve Dowercc16be82016-09-08 10:35:16 -07007831#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007832
Guido van Rossumd57fd912000-03-10 22:53:23 +00007833/* --- Character Mapping Codec -------------------------------------------- */
7834
Victor Stinnerfb161b12013-04-18 01:44:27 +02007835static int
7836charmap_decode_string(const char *s,
7837 Py_ssize_t size,
7838 PyObject *mapping,
7839 const char *errors,
7840 _PyUnicodeWriter *writer)
7841{
7842 const char *starts = s;
7843 const char *e;
7844 Py_ssize_t startinpos, endinpos;
7845 PyObject *errorHandler = NULL, *exc = NULL;
7846 Py_ssize_t maplen;
7847 enum PyUnicode_Kind mapkind;
7848 void *mapdata;
7849 Py_UCS4 x;
7850 unsigned char ch;
7851
7852 if (PyUnicode_READY(mapping) == -1)
7853 return -1;
7854
7855 maplen = PyUnicode_GET_LENGTH(mapping);
7856 mapdata = PyUnicode_DATA(mapping);
7857 mapkind = PyUnicode_KIND(mapping);
7858
7859 e = s + size;
7860
7861 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7862 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7863 * is disabled in encoding aliases, latin1 is preferred because
7864 * its implementation is faster. */
7865 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7866 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7867 Py_UCS4 maxchar = writer->maxchar;
7868
7869 assert (writer->kind == PyUnicode_1BYTE_KIND);
7870 while (s < e) {
7871 ch = *s;
7872 x = mapdata_ucs1[ch];
7873 if (x > maxchar) {
7874 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7875 goto onError;
7876 maxchar = writer->maxchar;
7877 outdata = (Py_UCS1 *)writer->data;
7878 }
7879 outdata[writer->pos] = x;
7880 writer->pos++;
7881 ++s;
7882 }
7883 return 0;
7884 }
7885
7886 while (s < e) {
7887 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7888 enum PyUnicode_Kind outkind = writer->kind;
7889 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7890 if (outkind == PyUnicode_1BYTE_KIND) {
7891 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7892 Py_UCS4 maxchar = writer->maxchar;
7893 while (s < e) {
7894 ch = *s;
7895 x = mapdata_ucs2[ch];
7896 if (x > maxchar)
7897 goto Error;
7898 outdata[writer->pos] = x;
7899 writer->pos++;
7900 ++s;
7901 }
7902 break;
7903 }
7904 else if (outkind == PyUnicode_2BYTE_KIND) {
7905 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7906 while (s < e) {
7907 ch = *s;
7908 x = mapdata_ucs2[ch];
7909 if (x == 0xFFFE)
7910 goto Error;
7911 outdata[writer->pos] = x;
7912 writer->pos++;
7913 ++s;
7914 }
7915 break;
7916 }
7917 }
7918 ch = *s;
7919
7920 if (ch < maplen)
7921 x = PyUnicode_READ(mapkind, mapdata, ch);
7922 else
7923 x = 0xfffe; /* invalid value */
7924Error:
7925 if (x == 0xfffe)
7926 {
7927 /* undefined mapping */
7928 startinpos = s-starts;
7929 endinpos = startinpos+1;
7930 if (unicode_decode_call_errorhandler_writer(
7931 errors, &errorHandler,
7932 "charmap", "character maps to <undefined>",
7933 &starts, &e, &startinpos, &endinpos, &exc, &s,
7934 writer)) {
7935 goto onError;
7936 }
7937 continue;
7938 }
7939
7940 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7941 goto onError;
7942 ++s;
7943 }
7944 Py_XDECREF(errorHandler);
7945 Py_XDECREF(exc);
7946 return 0;
7947
7948onError:
7949 Py_XDECREF(errorHandler);
7950 Py_XDECREF(exc);
7951 return -1;
7952}
7953
7954static int
7955charmap_decode_mapping(const char *s,
7956 Py_ssize_t size,
7957 PyObject *mapping,
7958 const char *errors,
7959 _PyUnicodeWriter *writer)
7960{
7961 const char *starts = s;
7962 const char *e;
7963 Py_ssize_t startinpos, endinpos;
7964 PyObject *errorHandler = NULL, *exc = NULL;
7965 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007966 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007967
7968 e = s + size;
7969
7970 while (s < e) {
7971 ch = *s;
7972
7973 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7974 key = PyLong_FromLong((long)ch);
7975 if (key == NULL)
7976 goto onError;
7977
7978 item = PyObject_GetItem(mapping, key);
7979 Py_DECREF(key);
7980 if (item == NULL) {
7981 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7982 /* No mapping found means: mapping is undefined. */
7983 PyErr_Clear();
7984 goto Undefined;
7985 } else
7986 goto onError;
7987 }
7988
7989 /* Apply mapping */
7990 if (item == Py_None)
7991 goto Undefined;
7992 if (PyLong_Check(item)) {
7993 long value = PyLong_AS_LONG(item);
7994 if (value == 0xFFFE)
7995 goto Undefined;
7996 if (value < 0 || value > MAX_UNICODE) {
7997 PyErr_Format(PyExc_TypeError,
7998 "character mapping must be in range(0x%lx)",
7999 (unsigned long)MAX_UNICODE + 1);
8000 goto onError;
8001 }
8002
8003 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8004 goto onError;
8005 }
8006 else if (PyUnicode_Check(item)) {
8007 if (PyUnicode_READY(item) == -1)
8008 goto onError;
8009 if (PyUnicode_GET_LENGTH(item) == 1) {
8010 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8011 if (value == 0xFFFE)
8012 goto Undefined;
8013 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8014 goto onError;
8015 }
8016 else {
8017 writer->overallocate = 1;
8018 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8019 goto onError;
8020 }
8021 }
8022 else {
8023 /* wrong return value */
8024 PyErr_SetString(PyExc_TypeError,
8025 "character mapping must return integer, None or str");
8026 goto onError;
8027 }
8028 Py_CLEAR(item);
8029 ++s;
8030 continue;
8031
8032Undefined:
8033 /* undefined mapping */
8034 Py_CLEAR(item);
8035 startinpos = s-starts;
8036 endinpos = startinpos+1;
8037 if (unicode_decode_call_errorhandler_writer(
8038 errors, &errorHandler,
8039 "charmap", "character maps to <undefined>",
8040 &starts, &e, &startinpos, &endinpos, &exc, &s,
8041 writer)) {
8042 goto onError;
8043 }
8044 }
8045 Py_XDECREF(errorHandler);
8046 Py_XDECREF(exc);
8047 return 0;
8048
8049onError:
8050 Py_XDECREF(item);
8051 Py_XDECREF(errorHandler);
8052 Py_XDECREF(exc);
8053 return -1;
8054}
8055
Alexander Belopolsky40018472011-02-26 01:02:56 +00008056PyObject *
8057PyUnicode_DecodeCharmap(const char *s,
8058 Py_ssize_t size,
8059 PyObject *mapping,
8060 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008062 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008063
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064 /* Default to Latin-1 */
8065 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008066 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067
Guido van Rossumd57fd912000-03-10 22:53:23 +00008068 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008069 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008070 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008071 writer.min_length = size;
8072 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008073 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008074
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008075 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008076 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8077 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008078 }
8079 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008080 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8081 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008082 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008083 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008084
Benjamin Peterson29060642009-01-31 22:14:21 +00008085 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008086 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008087 return NULL;
8088}
8089
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008090/* Charmap encoding: the lookup table */
8091
Alexander Belopolsky40018472011-02-26 01:02:56 +00008092struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 PyObject_HEAD
8094 unsigned char level1[32];
8095 int count2, count3;
8096 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008097};
8098
8099static PyObject*
8100encoding_map_size(PyObject *obj, PyObject* args)
8101{
8102 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008103 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008105}
8106
8107static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008108 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008109 PyDoc_STR("Return the size (in bytes) of this object") },
8110 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008111};
8112
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008113static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008114 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008115 "EncodingMap", /*tp_name*/
8116 sizeof(struct encoding_map), /*tp_basicsize*/
8117 0, /*tp_itemsize*/
8118 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008119 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008120 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 0, /*tp_getattr*/
8122 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008123 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 0, /*tp_repr*/
8125 0, /*tp_as_number*/
8126 0, /*tp_as_sequence*/
8127 0, /*tp_as_mapping*/
8128 0, /*tp_hash*/
8129 0, /*tp_call*/
8130 0, /*tp_str*/
8131 0, /*tp_getattro*/
8132 0, /*tp_setattro*/
8133 0, /*tp_as_buffer*/
8134 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8135 0, /*tp_doc*/
8136 0, /*tp_traverse*/
8137 0, /*tp_clear*/
8138 0, /*tp_richcompare*/
8139 0, /*tp_weaklistoffset*/
8140 0, /*tp_iter*/
8141 0, /*tp_iternext*/
8142 encoding_map_methods, /*tp_methods*/
8143 0, /*tp_members*/
8144 0, /*tp_getset*/
8145 0, /*tp_base*/
8146 0, /*tp_dict*/
8147 0, /*tp_descr_get*/
8148 0, /*tp_descr_set*/
8149 0, /*tp_dictoffset*/
8150 0, /*tp_init*/
8151 0, /*tp_alloc*/
8152 0, /*tp_new*/
8153 0, /*tp_free*/
8154 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008155};
8156
8157PyObject*
8158PyUnicode_BuildEncodingMap(PyObject* string)
8159{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008160 PyObject *result;
8161 struct encoding_map *mresult;
8162 int i;
8163 int need_dict = 0;
8164 unsigned char level1[32];
8165 unsigned char level2[512];
8166 unsigned char *mlevel1, *mlevel2, *mlevel3;
8167 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008168 int kind;
8169 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008170 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008171 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008172
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008173 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008174 PyErr_BadArgument();
8175 return NULL;
8176 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008177 kind = PyUnicode_KIND(string);
8178 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008179 length = PyUnicode_GET_LENGTH(string);
8180 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008181 memset(level1, 0xFF, sizeof level1);
8182 memset(level2, 0xFF, sizeof level2);
8183
8184 /* If there isn't a one-to-one mapping of NULL to \0,
8185 or if there are non-BMP characters, we need to use
8186 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008187 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008188 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008189 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008190 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008191 ch = PyUnicode_READ(kind, data, i);
8192 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008193 need_dict = 1;
8194 break;
8195 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008196 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008197 /* unmapped character */
8198 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008199 l1 = ch >> 11;
8200 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008201 if (level1[l1] == 0xFF)
8202 level1[l1] = count2++;
8203 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008204 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008205 }
8206
8207 if (count2 >= 0xFF || count3 >= 0xFF)
8208 need_dict = 1;
8209
8210 if (need_dict) {
8211 PyObject *result = PyDict_New();
8212 PyObject *key, *value;
8213 if (!result)
8214 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008215 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008216 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008217 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008218 if (!key || !value)
8219 goto failed1;
8220 if (PyDict_SetItem(result, key, value) == -1)
8221 goto failed1;
8222 Py_DECREF(key);
8223 Py_DECREF(value);
8224 }
8225 return result;
8226 failed1:
8227 Py_XDECREF(key);
8228 Py_XDECREF(value);
8229 Py_DECREF(result);
8230 return NULL;
8231 }
8232
8233 /* Create a three-level trie */
8234 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8235 16*count2 + 128*count3 - 1);
8236 if (!result)
8237 return PyErr_NoMemory();
8238 PyObject_Init(result, &EncodingMapType);
8239 mresult = (struct encoding_map*)result;
8240 mresult->count2 = count2;
8241 mresult->count3 = count3;
8242 mlevel1 = mresult->level1;
8243 mlevel2 = mresult->level23;
8244 mlevel3 = mresult->level23 + 16*count2;
8245 memcpy(mlevel1, level1, 32);
8246 memset(mlevel2, 0xFF, 16*count2);
8247 memset(mlevel3, 0, 128*count3);
8248 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008249 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008250 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008251 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8252 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008253 /* unmapped character */
8254 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008255 o1 = ch>>11;
8256 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008257 i2 = 16*mlevel1[o1] + o2;
8258 if (mlevel2[i2] == 0xFF)
8259 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008260 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008261 i3 = 128*mlevel2[i2] + o3;
8262 mlevel3[i3] = i;
8263 }
8264 return result;
8265}
8266
8267static int
Victor Stinner22168992011-11-20 17:09:18 +01008268encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008269{
8270 struct encoding_map *map = (struct encoding_map*)mapping;
8271 int l1 = c>>11;
8272 int l2 = (c>>7) & 0xF;
8273 int l3 = c & 0x7F;
8274 int i;
8275
Victor Stinner22168992011-11-20 17:09:18 +01008276 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008277 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008278 if (c == 0)
8279 return 0;
8280 /* level 1*/
8281 i = map->level1[l1];
8282 if (i == 0xFF) {
8283 return -1;
8284 }
8285 /* level 2*/
8286 i = map->level23[16*i+l2];
8287 if (i == 0xFF) {
8288 return -1;
8289 }
8290 /* level 3 */
8291 i = map->level23[16*map->count2 + 128*i + l3];
8292 if (i == 0) {
8293 return -1;
8294 }
8295 return i;
8296}
8297
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008298/* Lookup the character ch in the mapping. If the character
8299 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008300 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008301static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008302charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008303{
Christian Heimes217cfd12007-12-02 14:31:20 +00008304 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008305 PyObject *x;
8306
8307 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008308 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008309 x = PyObject_GetItem(mapping, w);
8310 Py_DECREF(w);
8311 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008312 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8313 /* No mapping found means: mapping is undefined. */
8314 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008315 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008316 } else
8317 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008318 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008319 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008320 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008321 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008322 long value = PyLong_AS_LONG(x);
8323 if (value < 0 || value > 255) {
8324 PyErr_SetString(PyExc_TypeError,
8325 "character mapping must be in range(256)");
8326 Py_DECREF(x);
8327 return NULL;
8328 }
8329 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008331 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008332 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008334 /* wrong return value */
8335 PyErr_Format(PyExc_TypeError,
8336 "character mapping must return integer, bytes or None, not %.400s",
8337 x->ob_type->tp_name);
8338 Py_DECREF(x);
8339 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008340 }
8341}
8342
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008343static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008344charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008345{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008346 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8347 /* exponentially overallocate to minimize reallocations */
8348 if (requiredsize < 2*outsize)
8349 requiredsize = 2*outsize;
8350 if (_PyBytes_Resize(outobj, requiredsize))
8351 return -1;
8352 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008353}
8354
/* Outcome of trying to encode a single character with
   charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or an output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008358/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008359 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360 space is available. Return a new reference to the object that
8361 was put in the output buffer, or Py_None, if the mapping was undefined
8362 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008363 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008364static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008365charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008366 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008367{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008368 PyObject *rep;
8369 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008370 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008371
Christian Heimes90aa7642007-12-19 02:45:37 +00008372 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008373 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008375 if (res == -1)
8376 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 if (outsize<requiredsize)
8378 if (charmapencode_resize(outobj, outpos, requiredsize))
8379 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008380 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 outstart[(*outpos)++] = (char)res;
8382 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008383 }
8384
8385 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008386 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008388 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 Py_DECREF(rep);
8390 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008391 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 if (PyLong_Check(rep)) {
8393 Py_ssize_t requiredsize = *outpos+1;
8394 if (outsize<requiredsize)
8395 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8396 Py_DECREF(rep);
8397 return enc_EXCEPTION;
8398 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008399 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008401 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 else {
8403 const char *repchars = PyBytes_AS_STRING(rep);
8404 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8405 Py_ssize_t requiredsize = *outpos+repsize;
8406 if (outsize<requiredsize)
8407 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8408 Py_DECREF(rep);
8409 return enc_EXCEPTION;
8410 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008411 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 memcpy(outstart + *outpos, repchars, repsize);
8413 *outpos += repsize;
8414 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008416 Py_DECREF(rep);
8417 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418}
8419
8420/* handle an error in PyUnicode_EncodeCharmap
8421 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008422static int
8423charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008424 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008426 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008427 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428{
8429 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008430 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008431 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008432 enum PyUnicode_Kind kind;
8433 void *data;
8434 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008435 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008436 Py_ssize_t collstartpos = *inpos;
8437 Py_ssize_t collendpos = *inpos+1;
8438 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008439 const char *encoding = "charmap";
8440 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008441 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008442 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008443 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008444
Benjamin Petersonbac79492012-01-14 13:34:47 -05008445 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008446 return -1;
8447 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008448 /* find all unencodable characters */
8449 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008450 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008451 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008452 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008453 val = encoding_map_lookup(ch, mapping);
8454 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 break;
8456 ++collendpos;
8457 continue;
8458 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008459
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008460 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8461 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 if (rep==NULL)
8463 return -1;
8464 else if (rep!=Py_None) {
8465 Py_DECREF(rep);
8466 break;
8467 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008468 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008470 }
8471 /* cache callback name lookup
8472 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008473 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008474 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008475
8476 switch (*error_handler) {
8477 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008478 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008479 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008480
8481 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008482 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008483 x = charmapencode_output('?', mapping, res, respos);
8484 if (x==enc_EXCEPTION) {
8485 return -1;
8486 }
8487 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008488 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 return -1;
8490 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008491 }
8492 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008493 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008494 *inpos = collendpos;
8495 break;
Victor Stinner50149202015-09-22 00:26:54 +02008496
8497 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008498 /* generate replacement (temporarily (mis)uses p) */
8499 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 char buffer[2+29+1+1];
8501 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008502 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 for (cp = buffer; *cp; ++cp) {
8504 x = charmapencode_output(*cp, mapping, res, respos);
8505 if (x==enc_EXCEPTION)
8506 return -1;
8507 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008508 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008509 return -1;
8510 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008511 }
8512 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008513 *inpos = collendpos;
8514 break;
Victor Stinner50149202015-09-22 00:26:54 +02008515
Benjamin Peterson14339b62009-01-31 16:36:08 +00008516 default:
Victor Stinner50149202015-09-22 00:26:54 +02008517 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008518 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008519 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008520 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008521 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008522 if (PyBytes_Check(repunicode)) {
8523 /* Directly copy bytes result to output. */
8524 Py_ssize_t outsize = PyBytes_Size(*res);
8525 Py_ssize_t requiredsize;
8526 repsize = PyBytes_Size(repunicode);
8527 requiredsize = *respos + repsize;
8528 if (requiredsize > outsize)
8529 /* Make room for all additional bytes. */
8530 if (charmapencode_resize(res, respos, requiredsize)) {
8531 Py_DECREF(repunicode);
8532 return -1;
8533 }
8534 memcpy(PyBytes_AsString(*res) + *respos,
8535 PyBytes_AsString(repunicode), repsize);
8536 *respos += repsize;
8537 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008538 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008539 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008540 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008541 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008542 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008543 Py_DECREF(repunicode);
8544 return -1;
8545 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008546 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008547 data = PyUnicode_DATA(repunicode);
8548 kind = PyUnicode_KIND(repunicode);
8549 for (index = 0; index < repsize; index++) {
8550 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8551 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008553 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 return -1;
8555 }
8556 else if (x==enc_FAILED) {
8557 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008558 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 return -1;
8560 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008561 }
8562 *inpos = newpos;
8563 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008564 }
8565 return 0;
8566}
8567
Alexander Belopolsky40018472011-02-26 01:02:56 +00008568PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008569_PyUnicode_EncodeCharmap(PyObject *unicode,
8570 PyObject *mapping,
8571 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008572{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008573 /* output object */
8574 PyObject *res = NULL;
8575 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008576 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008577 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008578 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008579 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008580 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008581 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008582 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008583 void *data;
8584 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585
Benjamin Petersonbac79492012-01-14 13:34:47 -05008586 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008587 return NULL;
8588 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008589 data = PyUnicode_DATA(unicode);
8590 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008591
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592 /* Default to Latin-1 */
8593 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008594 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008596 /* allocate enough for a simple encoding without
8597 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008598 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599 if (res == NULL)
8600 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008601 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008603
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008604 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008605 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008606 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008607 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008608 if (x==enc_EXCEPTION) /* error */
8609 goto onError;
8610 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008611 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008612 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008613 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008614 &res, &respos)) {
8615 goto onError;
8616 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008617 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 else
8619 /* done with this character => adjust input position */
8620 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008621 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008622
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008624 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008625 if (_PyBytes_Resize(&res, respos) < 0)
8626 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008627
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008628 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008629 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008630 return res;
8631
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633 Py_XDECREF(res);
8634 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008635 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008636 return NULL;
8637}
8638
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008639/* Deprecated */
8640PyObject *
8641PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8642 Py_ssize_t size,
8643 PyObject *mapping,
8644 const char *errors)
8645{
8646 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008647 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008648 if (unicode == NULL)
8649 return NULL;
8650 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8651 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008652 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008653}
8654
Alexander Belopolsky40018472011-02-26 01:02:56 +00008655PyObject *
8656PyUnicode_AsCharmapString(PyObject *unicode,
8657 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008658{
8659 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008660 PyErr_BadArgument();
8661 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008663 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008664}
8665
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008667static void
8668make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008670 Py_ssize_t startpos, Py_ssize_t endpos,
8671 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008673 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008674 *exceptionObject = _PyUnicodeTranslateError_Create(
8675 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676 }
8677 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8679 goto onError;
8680 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8681 goto onError;
8682 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8683 goto onError;
8684 return;
8685 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008686 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008687 }
8688}
8689
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690/* error handling callback helper:
8691 build arguments, call the callback and check the arguments,
8692 put the result into newpos and return the replacement string, which
8693 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008694static PyObject *
8695unicode_translate_call_errorhandler(const char *errors,
8696 PyObject **errorHandler,
8697 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008698 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008699 Py_ssize_t startpos, Py_ssize_t endpos,
8700 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008702 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008703
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008704 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008705 PyObject *restuple;
8706 PyObject *resunicode;
8707
8708 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008710 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712 }
8713
8714 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008715 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008719 restuple = PyObject_CallFunctionObjArgs(
8720 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008721 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008723 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008724 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 Py_DECREF(restuple);
8726 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008727 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008728 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 &resunicode, &i_newpos)) {
8730 Py_DECREF(restuple);
8731 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008732 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008733 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008735 else
8736 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008738 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008739 Py_DECREF(restuple);
8740 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008741 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008742 Py_INCREF(resunicode);
8743 Py_DECREF(restuple);
8744 return resunicode;
8745}
8746
8747/* Lookup the character ch in the mapping and put the result in result,
8748 which must be decrefed by the caller.
8749 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008750static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008751charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008752{
Christian Heimes217cfd12007-12-02 14:31:20 +00008753 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008754 PyObject *x;
8755
8756 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008757 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008758 x = PyObject_GetItem(mapping, w);
8759 Py_DECREF(w);
8760 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008761 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8762 /* No mapping found means: use 1:1 mapping. */
8763 PyErr_Clear();
8764 *result = NULL;
8765 return 0;
8766 } else
8767 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008768 }
8769 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008770 *result = x;
8771 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008772 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008773 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008775 if (value < 0 || value > MAX_UNICODE) {
8776 PyErr_Format(PyExc_ValueError,
8777 "character mapping must be in range(0x%x)",
8778 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008779 Py_DECREF(x);
8780 return -1;
8781 }
8782 *result = x;
8783 return 0;
8784 }
8785 else if (PyUnicode_Check(x)) {
8786 *result = x;
8787 return 0;
8788 }
8789 else {
8790 /* wrong return value */
8791 PyErr_SetString(PyExc_TypeError,
8792 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008793 Py_DECREF(x);
8794 return -1;
8795 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008796}
Victor Stinner1194ea02014-04-04 19:37:40 +02008797
8798/* lookup the character, write the result into the writer.
8799 Return 1 if the result was written into the writer, return 0 if the mapping
8800 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008801static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008802charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8803 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008804{
Victor Stinner1194ea02014-04-04 19:37:40 +02008805 PyObject *item;
8806
8807 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008808 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008809
8810 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008812 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008814 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008815 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008816 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008817
8818 if (item == Py_None) {
8819 Py_DECREF(item);
8820 return 0;
8821 }
8822
8823 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008824 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8825 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8826 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008827 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8828 Py_DECREF(item);
8829 return -1;
8830 }
8831 Py_DECREF(item);
8832 return 1;
8833 }
8834
8835 if (!PyUnicode_Check(item)) {
8836 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008838 }
8839
8840 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8841 Py_DECREF(item);
8842 return -1;
8843 }
8844
8845 Py_DECREF(item);
8846 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008847}
8848
Victor Stinner89a76ab2014-04-05 11:44:04 +02008849static int
8850unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8851 Py_UCS1 *translate)
8852{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008853 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008854 int ret = 0;
8855
Victor Stinner89a76ab2014-04-05 11:44:04 +02008856 if (charmaptranslate_lookup(ch, mapping, &item)) {
8857 return -1;
8858 }
8859
8860 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008861 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008862 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008863 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008864 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008865 /* not found => default to 1:1 mapping */
8866 translate[ch] = ch;
8867 return 1;
8868 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008869 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008870 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008871 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8872 used it */
8873 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008874 /* invalid character or character outside ASCII:
8875 skip the fast translate */
8876 goto exit;
8877 }
8878 translate[ch] = (Py_UCS1)replace;
8879 }
8880 else if (PyUnicode_Check(item)) {
8881 Py_UCS4 replace;
8882
8883 if (PyUnicode_READY(item) == -1) {
8884 Py_DECREF(item);
8885 return -1;
8886 }
8887 if (PyUnicode_GET_LENGTH(item) != 1)
8888 goto exit;
8889
8890 replace = PyUnicode_READ_CHAR(item, 0);
8891 if (replace > 127)
8892 goto exit;
8893 translate[ch] = (Py_UCS1)replace;
8894 }
8895 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008896 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008897 goto exit;
8898 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008899 ret = 1;
8900
Benjamin Peterson1365de72014-04-07 20:15:41 -04008901 exit:
8902 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008903 return ret;
8904}
8905
8906/* Fast path for ascii => ascii translation. Return 1 if the whole string
8907 was translated into writer, return 0 if the input string was partially
8908 translated into writer, raise an exception and return -1 on error. */
8909static int
8910unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008911 _PyUnicodeWriter *writer, int ignore,
8912 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008913{
Victor Stinner872b2912014-04-05 14:27:07 +02008914 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008915 Py_ssize_t len;
8916 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008917 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008918
Victor Stinner89a76ab2014-04-05 11:44:04 +02008919 len = PyUnicode_GET_LENGTH(input);
8920
Victor Stinner872b2912014-04-05 14:27:07 +02008921 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008922
8923 in = PyUnicode_1BYTE_DATA(input);
8924 end = in + len;
8925
8926 assert(PyUnicode_IS_ASCII(writer->buffer));
8927 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8928 out = PyUnicode_1BYTE_DATA(writer->buffer);
8929
Victor Stinner872b2912014-04-05 14:27:07 +02008930 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008931 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008932 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008933 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008934 int translate = unicode_fast_translate_lookup(mapping, ch,
8935 ascii_table);
8936 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008937 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008938 if (translate == 0)
8939 goto exit;
8940 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008941 }
Victor Stinner872b2912014-04-05 14:27:07 +02008942 if (ch2 == 0xfe) {
8943 if (ignore)
8944 continue;
8945 goto exit;
8946 }
8947 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008948 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008949 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008950 }
Victor Stinner872b2912014-04-05 14:27:07 +02008951 res = 1;
8952
8953exit:
8954 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008955 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008956 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008957}
8958
Victor Stinner3222da22015-10-01 22:07:32 +02008959static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008960_PyUnicode_TranslateCharmap(PyObject *input,
8961 PyObject *mapping,
8962 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008964 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008965 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008966 Py_ssize_t size, i;
8967 int kind;
8968 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008969 _PyUnicodeWriter writer;
8970 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008971 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008972 PyObject *errorHandler = NULL;
8973 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008974 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008975 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008976
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008978 PyErr_BadArgument();
8979 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 if (PyUnicode_READY(input) == -1)
8983 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008984 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 kind = PyUnicode_KIND(input);
8986 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008988 if (size == 0)
8989 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008991 /* allocate enough for a simple 1:1 translation without
8992 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008993 _PyUnicodeWriter_Init(&writer);
8994 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008995 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996
Victor Stinner872b2912014-04-05 14:27:07 +02008997 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8998
Victor Stinner33798672016-03-01 21:59:58 +01008999 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009000 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009001 if (PyUnicode_IS_ASCII(input)) {
9002 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9003 if (res < 0) {
9004 _PyUnicodeWriter_Dealloc(&writer);
9005 return NULL;
9006 }
9007 if (res == 1)
9008 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009009 }
Victor Stinner33798672016-03-01 21:59:58 +01009010 else {
9011 i = 0;
9012 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009013
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009014 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009015 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009016 int translate;
9017 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9018 Py_ssize_t newpos;
9019 /* startpos for collecting untranslatable chars */
9020 Py_ssize_t collstart;
9021 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009022 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009023
Victor Stinner1194ea02014-04-04 19:37:40 +02009024 ch = PyUnicode_READ(kind, data, i);
9025 translate = charmaptranslate_output(ch, mapping, &writer);
9026 if (translate < 0)
9027 goto onError;
9028
9029 if (translate != 0) {
9030 /* it worked => adjust input pointer */
9031 ++i;
9032 continue;
9033 }
9034
9035 /* untranslatable character */
9036 collstart = i;
9037 collend = i+1;
9038
9039 /* find all untranslatable characters */
9040 while (collend < size) {
9041 PyObject *x;
9042 ch = PyUnicode_READ(kind, data, collend);
9043 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009044 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009045 Py_XDECREF(x);
9046 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009047 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009048 ++collend;
9049 }
9050
9051 if (ignore) {
9052 i = collend;
9053 }
9054 else {
9055 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9056 reason, input, &exc,
9057 collstart, collend, &newpos);
9058 if (repunicode == NULL)
9059 goto onError;
9060 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009061 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009062 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009063 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009064 Py_DECREF(repunicode);
9065 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009066 }
9067 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009068 Py_XDECREF(exc);
9069 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009070 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071
Benjamin Peterson29060642009-01-31 22:14:21 +00009072 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009073 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009074 Py_XDECREF(exc);
9075 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009076 return NULL;
9077}
9078
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009079/* Deprecated. Use PyUnicode_Translate instead. */
9080PyObject *
9081PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9082 Py_ssize_t size,
9083 PyObject *mapping,
9084 const char *errors)
9085{
Christian Heimes5f520f42012-09-11 14:03:25 +02009086 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009087 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088 if (!unicode)
9089 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009090 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9091 Py_DECREF(unicode);
9092 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093}
9094
Alexander Belopolsky40018472011-02-26 01:02:56 +00009095PyObject *
9096PyUnicode_Translate(PyObject *str,
9097 PyObject *mapping,
9098 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009099{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009100 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009101 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009102 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009103}
Tim Petersced69f82003-09-16 20:30:58 +00009104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009105PyObject *
9106_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9107{
9108 if (!PyUnicode_Check(unicode)) {
9109 PyErr_BadInternalCall();
9110 return NULL;
9111 }
9112 if (PyUnicode_READY(unicode) == -1)
9113 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009114 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009115 /* If the string is already ASCII, just return the same string */
9116 Py_INCREF(unicode);
9117 return unicode;
9118 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009119
9120 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9121 PyObject *result = PyUnicode_New(len, 127);
9122 if (result == NULL) {
9123 return NULL;
9124 }
9125
9126 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9127 int kind = PyUnicode_KIND(unicode);
9128 const void *data = PyUnicode_DATA(unicode);
9129 Py_ssize_t i;
9130 for (i = 0; i < len; ++i) {
9131 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9132 if (ch < 127) {
9133 out[i] = ch;
9134 }
9135 else if (Py_UNICODE_ISSPACE(ch)) {
9136 out[i] = ' ';
9137 }
9138 else {
9139 int decimal = Py_UNICODE_TODECIMAL(ch);
9140 if (decimal < 0) {
9141 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009142 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009143 _PyUnicode_LENGTH(result) = i + 1;
9144 break;
9145 }
9146 out[i] = '0' + decimal;
9147 }
9148 }
9149
INADA Naoki16dfca42018-07-14 12:06:43 +09009150 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009151 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009152}
9153
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009154PyObject *
9155PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9156 Py_ssize_t length)
9157{
Victor Stinnerf0124502011-11-21 23:12:56 +01009158 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009159 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009160 Py_UCS4 maxchar;
9161 enum PyUnicode_Kind kind;
9162 void *data;
9163
Victor Stinner99d7ad02012-02-22 13:37:39 +01009164 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009165 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009166 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009167 if (ch > 127) {
9168 int decimal = Py_UNICODE_TODECIMAL(ch);
9169 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009170 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009171 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009172 }
9173 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009174
9175 /* Copy to a new string */
9176 decimal = PyUnicode_New(length, maxchar);
9177 if (decimal == NULL)
9178 return decimal;
9179 kind = PyUnicode_KIND(decimal);
9180 data = PyUnicode_DATA(decimal);
9181 /* Iterate over code points */
9182 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009183 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009184 if (ch > 127) {
9185 int decimal = Py_UNICODE_TODECIMAL(ch);
9186 if (decimal >= 0)
9187 ch = '0' + decimal;
9188 }
9189 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009190 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009191 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009192}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009193/* --- Decimal Encoder ---------------------------------------------------- */
9194
Alexander Belopolsky40018472011-02-26 01:02:56 +00009195int
9196PyUnicode_EncodeDecimal(Py_UNICODE *s,
9197 Py_ssize_t length,
9198 char *output,
9199 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009200{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009201 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009202 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009203 enum PyUnicode_Kind kind;
9204 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009205
9206 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009207 PyErr_BadArgument();
9208 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009209 }
9210
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009211 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009212 if (unicode == NULL)
9213 return -1;
9214
Victor Stinner42bf7752011-11-21 22:52:58 +01009215 kind = PyUnicode_KIND(unicode);
9216 data = PyUnicode_DATA(unicode);
9217
Victor Stinnerb84d7232011-11-22 01:50:07 +01009218 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009219 PyObject *exc;
9220 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009221 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009222 Py_ssize_t startpos;
9223
9224 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009225
Benjamin Peterson29060642009-01-31 22:14:21 +00009226 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009227 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009228 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009229 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009230 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009231 decimal = Py_UNICODE_TODECIMAL(ch);
9232 if (decimal >= 0) {
9233 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009234 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009235 continue;
9236 }
9237 if (0 < ch && ch < 256) {
9238 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009239 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009240 continue;
9241 }
Victor Stinner6345be92011-11-25 20:09:01 +01009242
Victor Stinner42bf7752011-11-21 22:52:58 +01009243 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009244 exc = NULL;
9245 raise_encode_exception(&exc, "decimal", unicode,
9246 startpos, startpos+1,
9247 "invalid decimal Unicode string");
9248 Py_XDECREF(exc);
9249 Py_DECREF(unicode);
9250 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009251 }
9252 /* 0-terminate the output string */
9253 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009254 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009255 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009256}
9257
Guido van Rossumd57fd912000-03-10 22:53:23 +00009258/* --- Helpers ------------------------------------------------------------ */
9259
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `end` or `start` is taken relative
   to `len` and clipped to 0 if it is still negative.  `start` is NOT
   clipped to `len`, so callers still have to handle start > end.

   `start` and `end` must be modifiable lvalues.  All three arguments may be
   evaluated more than once, so they must be free of side effects.

   The body is wrapped in do { ... } while (0) so that the macro behaves
   like a single statement, e.g. as the body of an unbraced if/else.  It
   must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if (end > (len)) {                      \
            end = (len);                        \
        }                                       \
        else if (end < 0) {                     \
            end += (len);                       \
            if (end < 0) {                      \
                end = 0;                        \
            }                                   \
        }                                       \
        if (start < 0) {                        \
            start += (len);                     \
            if (start < 0) {                    \
                start = 0;                      \
            }                                   \
        }                                       \
    } while (0)
9274
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009276any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009278 Py_ssize_t end,
9279 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009281 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009282 void *buf1, *buf2;
9283 Py_ssize_t len1, len2, result;
9284
9285 kind1 = PyUnicode_KIND(s1);
9286 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009287 if (kind1 < kind2)
9288 return -1;
9289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009290 len1 = PyUnicode_GET_LENGTH(s1);
9291 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009292 ADJUST_INDICES(start, end, len1);
9293 if (end - start < len2)
9294 return -1;
9295
9296 buf1 = PyUnicode_DATA(s1);
9297 buf2 = PyUnicode_DATA(s2);
9298 if (len2 == 1) {
9299 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9300 result = findchar((const char *)buf1 + kind1*start,
9301 kind1, end - start, ch, direction);
9302 if (result == -1)
9303 return -1;
9304 else
9305 return start + result;
9306 }
9307
9308 if (kind2 != kind1) {
9309 buf2 = _PyUnicode_AsKind(s2, kind1);
9310 if (!buf2)
9311 return -2;
9312 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009313
Victor Stinner794d5672011-10-10 03:21:36 +02009314 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009315 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009316 case PyUnicode_1BYTE_KIND:
9317 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9318 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9319 else
9320 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9321 break;
9322 case PyUnicode_2BYTE_KIND:
9323 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9324 break;
9325 case PyUnicode_4BYTE_KIND:
9326 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9327 break;
9328 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009329 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009330 }
9331 }
9332 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009333 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009334 case PyUnicode_1BYTE_KIND:
9335 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9336 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9337 else
9338 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9339 break;
9340 case PyUnicode_2BYTE_KIND:
9341 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9342 break;
9343 case PyUnicode_4BYTE_KIND:
9344 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9345 break;
9346 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009347 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009348 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349 }
9350
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009351 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 PyMem_Free(buf2);
9353
9354 return result;
9355}
9356
Victor Stinner59423e32018-11-26 13:40:01 +01009357/* _PyUnicode_InsertThousandsGrouping() helper functions */
9358#include "stringlib/localeutil.h"
9359
9360/**
9361 * InsertThousandsGrouping:
9362 * @writer: Unicode writer.
9363 * @n_buffer: Number of characters in @buffer.
9364 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9365 * @d_pos: Start of digits string.
9366 * @n_digits: The number of digits in the string, in which we want
9367 * to put the grouping chars.
9368 * @min_width: The minimum width of the digits in the output string.
9369 * Output will be zero-padded on the left to fill.
9370 * @grouping: see definition in localeconv().
9371 * @thousands_sep: see definition in localeconv().
9372 *
9373 * There are 2 modes: counting and filling. If @writer is NULL,
9374 * we are in counting mode, else filling mode.
9375 * If counting, the required buffer size is returned.
9376 * If filling, we know the buffer will be large enough, so we don't
9377 * need to pass in the buffer size.
9378 * Inserts thousand grouping characters (as defined by grouping and
9379 * thousands_sep) into @writer.
9380 *
9381 * Return value: -1 on error, number of characters otherwise.
9382 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009384_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009385 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009386 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009387 PyObject *digits,
9388 Py_ssize_t d_pos,
9389 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009390 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009391 const char *grouping,
9392 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009393 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394{
Xtreak3f7983a2019-01-07 20:39:14 +05309395 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009396 if (writer) {
9397 assert(digits != NULL);
9398 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009399 }
9400 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009401 assert(digits == NULL);
9402 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009403 }
Victor Stinner59423e32018-11-26 13:40:01 +01009404 assert(0 <= d_pos);
9405 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009406 assert(grouping != NULL);
9407
9408 if (digits != NULL) {
9409 if (PyUnicode_READY(digits) == -1) {
9410 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009411 }
Victor Stinner59423e32018-11-26 13:40:01 +01009412 }
9413 if (PyUnicode_READY(thousands_sep) == -1) {
9414 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009415 }
9416
Victor Stinner59423e32018-11-26 13:40:01 +01009417 Py_ssize_t count = 0;
9418 Py_ssize_t n_zeros;
9419 int loop_broken = 0;
9420 int use_separator = 0; /* First time through, don't append the
9421 separator. They only go between
9422 groups. */
9423 Py_ssize_t buffer_pos;
9424 Py_ssize_t digits_pos;
9425 Py_ssize_t len;
9426 Py_ssize_t n_chars;
9427 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9428 be looked at */
9429 /* A generator that returns all of the grouping widths, until it
9430 returns 0. */
9431 GroupGenerator groupgen;
9432 GroupGenerator_init(&groupgen, grouping);
9433 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9434
9435 /* if digits are not grouped, thousands separator
9436 should be an empty string */
9437 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9438
9439 digits_pos = d_pos + n_digits;
9440 if (writer) {
9441 buffer_pos = writer->pos + n_buffer;
9442 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9443 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009444 }
Victor Stinner59423e32018-11-26 13:40:01 +01009445 else {
9446 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009447 }
Victor Stinner59423e32018-11-26 13:40:01 +01009448
9449 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009450 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009451 }
Victor Stinner59423e32018-11-26 13:40:01 +01009452
9453 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9454 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9455 n_zeros = Py_MAX(0, len - remaining);
9456 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9457
9458 /* Use n_zero zero's and n_chars chars */
9459
9460 /* Count only, don't do anything. */
9461 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9462
9463 /* Copy into the writer. */
9464 InsertThousandsGrouping_fill(writer, &buffer_pos,
9465 digits, &digits_pos,
9466 n_chars, n_zeros,
9467 use_separator ? thousands_sep : NULL,
9468 thousands_sep_len, maxchar);
9469
9470 /* Use a separator next time. */
9471 use_separator = 1;
9472
9473 remaining -= n_chars;
9474 min_width -= len;
9475
9476 if (remaining <= 0 && min_width <= 0) {
9477 loop_broken = 1;
9478 break;
9479 }
9480 min_width -= thousands_sep_len;
9481 }
9482 if (!loop_broken) {
9483 /* We left the loop without using a break statement. */
9484
9485 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9486 n_zeros = Py_MAX(0, len - remaining);
9487 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9488
9489 /* Use n_zero zero's and n_chars chars */
9490 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9491
9492 /* Copy into the writer. */
9493 InsertThousandsGrouping_fill(writer, &buffer_pos,
9494 digits, &digits_pos,
9495 n_chars, n_zeros,
9496 use_separator ? thousands_sep : NULL,
9497 thousands_sep_len, maxchar);
9498 }
9499 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500}
9501
9502
Alexander Belopolsky40018472011-02-26 01:02:56 +00009503Py_ssize_t
9504PyUnicode_Count(PyObject *str,
9505 PyObject *substr,
9506 Py_ssize_t start,
9507 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009508{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009509 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009510 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009511 void *buf1 = NULL, *buf2 = NULL;
9512 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009513
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009514 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009515 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009516
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009517 kind1 = PyUnicode_KIND(str);
9518 kind2 = PyUnicode_KIND(substr);
9519 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009520 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009521
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009522 len1 = PyUnicode_GET_LENGTH(str);
9523 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009524 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009525 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009526 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009527
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009528 buf1 = PyUnicode_DATA(str);
9529 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009530 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009531 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009532 if (!buf2)
9533 goto onError;
9534 }
9535
9536 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009538 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009539 result = asciilib_count(
9540 ((Py_UCS1*)buf1) + start, end - start,
9541 buf2, len2, PY_SSIZE_T_MAX
9542 );
9543 else
9544 result = ucs1lib_count(
9545 ((Py_UCS1*)buf1) + start, end - start,
9546 buf2, len2, PY_SSIZE_T_MAX
9547 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009548 break;
9549 case PyUnicode_2BYTE_KIND:
9550 result = ucs2lib_count(
9551 ((Py_UCS2*)buf1) + start, end - start,
9552 buf2, len2, PY_SSIZE_T_MAX
9553 );
9554 break;
9555 case PyUnicode_4BYTE_KIND:
9556 result = ucs4lib_count(
9557 ((Py_UCS4*)buf1) + start, end - start,
9558 buf2, len2, PY_SSIZE_T_MAX
9559 );
9560 break;
9561 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009562 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009564
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009565 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 PyMem_Free(buf2);
9567
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009570 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009571 PyMem_Free(buf2);
9572 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009573}
9574
Alexander Belopolsky40018472011-02-26 01:02:56 +00009575Py_ssize_t
9576PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009577 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009578 Py_ssize_t start,
9579 Py_ssize_t end,
9580 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009582 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009583 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009584
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009585 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009586}
9587
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588Py_ssize_t
9589PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9590 Py_ssize_t start, Py_ssize_t end,
9591 int direction)
9592{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009593 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009594 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009595 if (PyUnicode_READY(str) == -1)
9596 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009597 len = PyUnicode_GET_LENGTH(str);
9598 ADJUST_INDICES(start, end, len);
9599 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009600 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009602 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9603 kind, end-start, ch, direction);
9604 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009606 else
9607 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608}
9609
Alexander Belopolsky40018472011-02-26 01:02:56 +00009610static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009611tailmatch(PyObject *self,
9612 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009613 Py_ssize_t start,
9614 Py_ssize_t end,
9615 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009616{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617 int kind_self;
9618 int kind_sub;
9619 void *data_self;
9620 void *data_sub;
9621 Py_ssize_t offset;
9622 Py_ssize_t i;
9623 Py_ssize_t end_sub;
9624
9625 if (PyUnicode_READY(self) == -1 ||
9626 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009627 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009628
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009629 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9630 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009631 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009632 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009633
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009634 if (PyUnicode_GET_LENGTH(substring) == 0)
9635 return 1;
9636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009637 kind_self = PyUnicode_KIND(self);
9638 data_self = PyUnicode_DATA(self);
9639 kind_sub = PyUnicode_KIND(substring);
9640 data_sub = PyUnicode_DATA(substring);
9641 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9642
9643 if (direction > 0)
9644 offset = end;
9645 else
9646 offset = start;
9647
9648 if (PyUnicode_READ(kind_self, data_self, offset) ==
9649 PyUnicode_READ(kind_sub, data_sub, 0) &&
9650 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9651 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9652 /* If both are of the same kind, memcmp is sufficient */
9653 if (kind_self == kind_sub) {
9654 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009655 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 data_sub,
9657 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009658 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009660 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009661 else {
9662 /* We do not need to compare 0 and len(substring)-1 because
9663 the if statement above ensured already that they are equal
9664 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009665 for (i = 1; i < end_sub; ++i) {
9666 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9667 PyUnicode_READ(kind_sub, data_sub, i))
9668 return 0;
9669 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009670 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009671 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672 }
9673
9674 return 0;
9675}
9676
Alexander Belopolsky40018472011-02-26 01:02:56 +00009677Py_ssize_t
9678PyUnicode_Tailmatch(PyObject *str,
9679 PyObject *substr,
9680 Py_ssize_t start,
9681 Py_ssize_t end,
9682 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009684 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009685 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009686
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009687 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009688}
9689
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690static PyObject *
9691ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009692{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009693 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9694 char *resdata, *data = PyUnicode_DATA(self);
9695 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009696
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009697 res = PyUnicode_New(len, 127);
9698 if (res == NULL)
9699 return NULL;
9700 resdata = PyUnicode_DATA(res);
9701 if (lower)
9702 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704 _Py_bytes_upper(resdata, data, len);
9705 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706}
9707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 Py_ssize_t j;
9712 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009713 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009715
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009716 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9717
9718 where ! is a negation and \p{xxx} is a character with property xxx.
9719 */
9720 for (j = i - 1; j >= 0; j--) {
9721 c = PyUnicode_READ(kind, data, j);
9722 if (!_PyUnicode_IsCaseIgnorable(c))
9723 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009725 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9726 if (final_sigma) {
9727 for (j = i + 1; j < length; j++) {
9728 c = PyUnicode_READ(kind, data, j);
9729 if (!_PyUnicode_IsCaseIgnorable(c))
9730 break;
9731 }
9732 final_sigma = j == length || !_PyUnicode_IsCased(c);
9733 }
9734 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735}
9736
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009737static int
9738lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9739 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009741 /* Obscure special case. */
9742 if (c == 0x3A3) {
9743 mapped[0] = handle_capital_sigma(kind, data, length, i);
9744 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009746 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747}
9748
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009749static Py_ssize_t
9750do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009751{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009752 Py_ssize_t i, k = 0;
9753 int n_res, j;
9754 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009755
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009756 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009757 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009758 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009759 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009760 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009761 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009762 for (i = 1; i < length; i++) {
9763 c = PyUnicode_READ(kind, data, i);
9764 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9765 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009766 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009767 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009768 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009769 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009770 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771}
9772
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009773static Py_ssize_t
9774do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9775 Py_ssize_t i, k = 0;
9776
9777 for (i = 0; i < length; i++) {
9778 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9779 int n_res, j;
9780 if (Py_UNICODE_ISUPPER(c)) {
9781 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9782 }
9783 else if (Py_UNICODE_ISLOWER(c)) {
9784 n_res = _PyUnicode_ToUpperFull(c, mapped);
9785 }
9786 else {
9787 n_res = 1;
9788 mapped[0] = c;
9789 }
9790 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009791 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009792 res[k++] = mapped[j];
9793 }
9794 }
9795 return k;
9796}
9797
9798static Py_ssize_t
9799do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9800 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009801{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009802 Py_ssize_t i, k = 0;
9803
9804 for (i = 0; i < length; i++) {
9805 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9806 int n_res, j;
9807 if (lower)
9808 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9809 else
9810 n_res = _PyUnicode_ToUpperFull(c, mapped);
9811 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009812 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009813 res[k++] = mapped[j];
9814 }
9815 }
9816 return k;
9817}
9818
9819static Py_ssize_t
9820do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9821{
9822 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9823}
9824
9825static Py_ssize_t
9826do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9827{
9828 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9829}
9830
Benjamin Petersone51757f2012-01-12 21:10:29 -05009831static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009832do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9833{
9834 Py_ssize_t i, k = 0;
9835
9836 for (i = 0; i < length; i++) {
9837 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9838 Py_UCS4 mapped[3];
9839 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9840 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009841 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009842 res[k++] = mapped[j];
9843 }
9844 }
9845 return k;
9846}
9847
9848static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009849do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9850{
9851 Py_ssize_t i, k = 0;
9852 int previous_is_cased;
9853
9854 previous_is_cased = 0;
9855 for (i = 0; i < length; i++) {
9856 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9857 Py_UCS4 mapped[3];
9858 int n_res, j;
9859
9860 if (previous_is_cased)
9861 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9862 else
9863 n_res = _PyUnicode_ToTitleFull(c, mapped);
9864
9865 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009866 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009867 res[k++] = mapped[j];
9868 }
9869
9870 previous_is_cased = _PyUnicode_IsCased(c);
9871 }
9872 return k;
9873}
9874
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009875static PyObject *
9876case_operation(PyObject *self,
9877 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9878{
9879 PyObject *res = NULL;
9880 Py_ssize_t length, newlength = 0;
9881 int kind, outkind;
9882 void *data, *outdata;
9883 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9884
Benjamin Petersoneea48462012-01-16 14:28:50 -05009885 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009886
9887 kind = PyUnicode_KIND(self);
9888 data = PyUnicode_DATA(self);
9889 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009890 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009891 PyErr_SetString(PyExc_OverflowError, "string is too long");
9892 return NULL;
9893 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009894 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009895 if (tmp == NULL)
9896 return PyErr_NoMemory();
9897 newlength = perform(kind, data, length, tmp, &maxchar);
9898 res = PyUnicode_New(newlength, maxchar);
9899 if (res == NULL)
9900 goto leave;
9901 tmpend = tmp + newlength;
9902 outdata = PyUnicode_DATA(res);
9903 outkind = PyUnicode_KIND(res);
9904 switch (outkind) {
9905 case PyUnicode_1BYTE_KIND:
9906 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9907 break;
9908 case PyUnicode_2BYTE_KIND:
9909 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9910 break;
9911 case PyUnicode_4BYTE_KIND:
9912 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9913 break;
9914 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009915 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009916 }
9917 leave:
9918 PyMem_FREE(tmp);
9919 return res;
9920}
9921
Tim Peters8ce9f162004-08-27 01:49:32 +00009922PyObject *
9923PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009925 PyObject *res;
9926 PyObject *fseq;
9927 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009928 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009929
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009930 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009931 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009932 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009933 }
9934
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009935 /* NOTE: the following code can't call back into Python code,
9936 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009937 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009938
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009939 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009940 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009941 res = _PyUnicode_JoinArray(separator, items, seqlen);
9942 Py_DECREF(fseq);
9943 return res;
9944}
9945
9946PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009947_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009948{
9949 PyObject *res = NULL; /* the result */
9950 PyObject *sep = NULL;
9951 Py_ssize_t seplen;
9952 PyObject *item;
9953 Py_ssize_t sz, i, res_offset;
9954 Py_UCS4 maxchar;
9955 Py_UCS4 item_maxchar;
9956 int use_memcpy;
9957 unsigned char *res_data = NULL, *sep_data = NULL;
9958 PyObject *last_obj;
9959 unsigned int kind = 0;
9960
Tim Peters05eba1f2004-08-27 21:32:02 +00009961 /* If empty sequence, return u"". */
9962 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009963 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009964 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009965
Tim Peters05eba1f2004-08-27 21:32:02 +00009966 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009967 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009968 if (seqlen == 1) {
9969 if (PyUnicode_CheckExact(items[0])) {
9970 res = items[0];
9971 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009972 return res;
9973 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009974 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009975 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009976 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009977 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009978 /* Set up sep and seplen */
9979 if (separator == NULL) {
9980 /* fall back to a blank space separator */
9981 sep = PyUnicode_FromOrdinal(' ');
9982 if (!sep)
9983 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009984 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009985 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009986 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009987 else {
9988 if (!PyUnicode_Check(separator)) {
9989 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009990 "separator: expected str instance,"
9991 " %.80s found",
9992 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009993 goto onError;
9994 }
9995 if (PyUnicode_READY(separator))
9996 goto onError;
9997 sep = separator;
9998 seplen = PyUnicode_GET_LENGTH(separator);
9999 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10000 /* inc refcount to keep this code path symmetric with the
10001 above case of a blank separator */
10002 Py_INCREF(sep);
10003 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010004 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010005 }
10006
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010007 /* There are at least two things to join, or else we have a subclass
10008 * of str in the sequence.
10009 * Do a pre-pass to figure out the total amount of space we'll
10010 * need (sz), and see whether all argument are strings.
10011 */
10012 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010013#ifdef Py_DEBUG
10014 use_memcpy = 0;
10015#else
10016 use_memcpy = 1;
10017#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010018 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010019 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010020 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010021 if (!PyUnicode_Check(item)) {
10022 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010023 "sequence item %zd: expected str instance,"
10024 " %.80s found",
10025 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010026 goto onError;
10027 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 if (PyUnicode_READY(item) == -1)
10029 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010030 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010032 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010033 if (i != 0) {
10034 add_sz += seplen;
10035 }
10036 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010037 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010038 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010039 goto onError;
10040 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010041 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010042 if (use_memcpy && last_obj != NULL) {
10043 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10044 use_memcpy = 0;
10045 }
10046 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010047 }
Tim Petersced69f82003-09-16 20:30:58 +000010048
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010050 if (res == NULL)
10051 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010052
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010053 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010054#ifdef Py_DEBUG
10055 use_memcpy = 0;
10056#else
10057 if (use_memcpy) {
10058 res_data = PyUnicode_1BYTE_DATA(res);
10059 kind = PyUnicode_KIND(res);
10060 if (seplen != 0)
10061 sep_data = PyUnicode_1BYTE_DATA(sep);
10062 }
10063#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010064 if (use_memcpy) {
10065 for (i = 0; i < seqlen; ++i) {
10066 Py_ssize_t itemlen;
10067 item = items[i];
10068
10069 /* Copy item, and maybe the separator. */
10070 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010071 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010072 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010073 kind * seplen);
10074 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010075 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010076
10077 itemlen = PyUnicode_GET_LENGTH(item);
10078 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010079 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010080 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010081 kind * itemlen);
10082 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010083 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010084 }
10085 assert(res_data == PyUnicode_1BYTE_DATA(res)
10086 + kind * PyUnicode_GET_LENGTH(res));
10087 }
10088 else {
10089 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10090 Py_ssize_t itemlen;
10091 item = items[i];
10092
10093 /* Copy item, and maybe the separator. */
10094 if (i && seplen != 0) {
10095 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10096 res_offset += seplen;
10097 }
10098
10099 itemlen = PyUnicode_GET_LENGTH(item);
10100 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010101 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010102 res_offset += itemlen;
10103 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010104 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010105 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010106 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010109 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010111
Benjamin Peterson29060642009-01-31 22:14:21 +000010112 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010114 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010115 return NULL;
10116}
10117
Victor Stinnerd3f08822012-05-29 12:57:52 +020010118void
10119_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10120 Py_UCS4 fill_char)
10121{
10122 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010123 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010124 assert(PyUnicode_IS_READY(unicode));
10125 assert(unicode_modifiable(unicode));
10126 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10127 assert(start >= 0);
10128 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010129 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010130}
10131
Victor Stinner3fe55312012-01-04 00:33:50 +010010132Py_ssize_t
10133PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10134 Py_UCS4 fill_char)
10135{
10136 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010137
10138 if (!PyUnicode_Check(unicode)) {
10139 PyErr_BadInternalCall();
10140 return -1;
10141 }
10142 if (PyUnicode_READY(unicode) == -1)
10143 return -1;
10144 if (unicode_check_modifiable(unicode))
10145 return -1;
10146
Victor Stinnerd3f08822012-05-29 12:57:52 +020010147 if (start < 0) {
10148 PyErr_SetString(PyExc_IndexError, "string index out of range");
10149 return -1;
10150 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010151 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10152 PyErr_SetString(PyExc_ValueError,
10153 "fill character is bigger than "
10154 "the string maximum character");
10155 return -1;
10156 }
10157
10158 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10159 length = Py_MIN(maxlen, length);
10160 if (length <= 0)
10161 return 0;
10162
Victor Stinnerd3f08822012-05-29 12:57:52 +020010163 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010164 return length;
10165}
10166
Victor Stinner9310abb2011-10-05 00:59:23 +020010167static PyObject *
10168pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010169 Py_ssize_t left,
10170 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010171 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010173 PyObject *u;
10174 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010175 int kind;
10176 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010177
10178 if (left < 0)
10179 left = 0;
10180 if (right < 0)
10181 right = 0;
10182
Victor Stinnerc4b49542011-12-11 22:44:26 +010010183 if (left == 0 && right == 0)
10184 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10187 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010188 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10189 return NULL;
10190 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010191 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010192 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010193 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010194 if (!u)
10195 return NULL;
10196
10197 kind = PyUnicode_KIND(u);
10198 data = PyUnicode_DATA(u);
10199 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010200 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010201 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010202 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010203 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010204 assert(_PyUnicode_CheckConsistency(u, 1));
10205 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206}
10207
Alexander Belopolsky40018472011-02-26 01:02:56 +000010208PyObject *
10209PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010211 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010213 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010215
Benjamin Petersonead6b532011-12-20 17:23:42 -060010216 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010217 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010218 if (PyUnicode_IS_ASCII(string))
10219 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010220 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010221 PyUnicode_GET_LENGTH(string), keepends);
10222 else
10223 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010224 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010225 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010226 break;
10227 case PyUnicode_2BYTE_KIND:
10228 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010229 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 PyUnicode_GET_LENGTH(string), keepends);
10231 break;
10232 case PyUnicode_4BYTE_KIND:
10233 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010234 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010235 PyUnicode_GET_LENGTH(string), keepends);
10236 break;
10237 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010238 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010239 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010241}
10242
Alexander Belopolsky40018472011-02-26 01:02:56 +000010243static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010244split(PyObject *self,
10245 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010246 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010248 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 void *buf1, *buf2;
10250 Py_ssize_t len1, len2;
10251 PyObject* out;
10252
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010254 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 if (PyUnicode_READY(self) == -1)
10257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010260 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010262 if (PyUnicode_IS_ASCII(self))
10263 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010264 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010265 PyUnicode_GET_LENGTH(self), maxcount
10266 );
10267 else
10268 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010269 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010270 PyUnicode_GET_LENGTH(self), maxcount
10271 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 case PyUnicode_2BYTE_KIND:
10273 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010274 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 PyUnicode_GET_LENGTH(self), maxcount
10276 );
10277 case PyUnicode_4BYTE_KIND:
10278 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010279 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 PyUnicode_GET_LENGTH(self), maxcount
10281 );
10282 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010283 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 }
10285
10286 if (PyUnicode_READY(substring) == -1)
10287 return NULL;
10288
10289 kind1 = PyUnicode_KIND(self);
10290 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010291 len1 = PyUnicode_GET_LENGTH(self);
10292 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010293 if (kind1 < kind2 || len1 < len2) {
10294 out = PyList_New(1);
10295 if (out == NULL)
10296 return NULL;
10297 Py_INCREF(self);
10298 PyList_SET_ITEM(out, 0, self);
10299 return out;
10300 }
10301 buf1 = PyUnicode_DATA(self);
10302 buf2 = PyUnicode_DATA(substring);
10303 if (kind2 != kind1) {
10304 buf2 = _PyUnicode_AsKind(substring, kind1);
10305 if (!buf2)
10306 return NULL;
10307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010308
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010309 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010311 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10312 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010313 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010314 else
10315 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010316 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010317 break;
10318 case PyUnicode_2BYTE_KIND:
10319 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010320 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010321 break;
10322 case PyUnicode_4BYTE_KIND:
10323 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010324 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 break;
10326 default:
10327 out = NULL;
10328 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010329 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010330 PyMem_Free(buf2);
10331 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332}
10333
Alexander Belopolsky40018472011-02-26 01:02:56 +000010334static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010335rsplit(PyObject *self,
10336 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010337 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010338{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010339 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 void *buf1, *buf2;
10341 Py_ssize_t len1, len2;
10342 PyObject* out;
10343
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010344 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010345 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 if (PyUnicode_READY(self) == -1)
10348 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010351 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010353 if (PyUnicode_IS_ASCII(self))
10354 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010355 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010356 PyUnicode_GET_LENGTH(self), maxcount
10357 );
10358 else
10359 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010360 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010361 PyUnicode_GET_LENGTH(self), maxcount
10362 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 case PyUnicode_2BYTE_KIND:
10364 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010365 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 PyUnicode_GET_LENGTH(self), maxcount
10367 );
10368 case PyUnicode_4BYTE_KIND:
10369 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010370 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 PyUnicode_GET_LENGTH(self), maxcount
10372 );
10373 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010374 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 }
10376
10377 if (PyUnicode_READY(substring) == -1)
10378 return NULL;
10379
10380 kind1 = PyUnicode_KIND(self);
10381 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010382 len1 = PyUnicode_GET_LENGTH(self);
10383 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010384 if (kind1 < kind2 || len1 < len2) {
10385 out = PyList_New(1);
10386 if (out == NULL)
10387 return NULL;
10388 Py_INCREF(self);
10389 PyList_SET_ITEM(out, 0, self);
10390 return out;
10391 }
10392 buf1 = PyUnicode_DATA(self);
10393 buf2 = PyUnicode_DATA(substring);
10394 if (kind2 != kind1) {
10395 buf2 = _PyUnicode_AsKind(substring, kind1);
10396 if (!buf2)
10397 return NULL;
10398 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010399
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010400 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010402 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10403 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010404 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010405 else
10406 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010407 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010408 break;
10409 case PyUnicode_2BYTE_KIND:
10410 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010411 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010412 break;
10413 case PyUnicode_4BYTE_KIND:
10414 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010415 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 break;
10417 default:
10418 out = NULL;
10419 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010420 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421 PyMem_Free(buf2);
10422 return out;
10423}
10424
10425static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010426anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10427 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010429 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010431 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10432 return asciilib_find(buf1, len1, buf2, len2, offset);
10433 else
10434 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435 case PyUnicode_2BYTE_KIND:
10436 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10437 case PyUnicode_4BYTE_KIND:
10438 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10439 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010440 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441}
10442
10443static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010444anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10445 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010447 switch (kind) {
10448 case PyUnicode_1BYTE_KIND:
10449 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10450 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10451 else
10452 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10453 case PyUnicode_2BYTE_KIND:
10454 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10455 case PyUnicode_4BYTE_KIND:
10456 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10457 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010458 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010459}
10460
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010461static void
10462replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10463 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10464{
10465 int kind = PyUnicode_KIND(u);
10466 void *data = PyUnicode_DATA(u);
10467 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10468 if (kind == PyUnicode_1BYTE_KIND) {
10469 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10470 (Py_UCS1 *)data + len,
10471 u1, u2, maxcount);
10472 }
10473 else if (kind == PyUnicode_2BYTE_KIND) {
10474 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10475 (Py_UCS2 *)data + len,
10476 u1, u2, maxcount);
10477 }
10478 else {
10479 assert(kind == PyUnicode_4BYTE_KIND);
10480 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10481 (Py_UCS4 *)data + len,
10482 u1, u2, maxcount);
10483 }
10484}
10485
Alexander Belopolsky40018472011-02-26 01:02:56 +000010486static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487replace(PyObject *self, PyObject *str1,
10488 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010489{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490 PyObject *u;
10491 char *sbuf = PyUnicode_DATA(self);
10492 char *buf1 = PyUnicode_DATA(str1);
10493 char *buf2 = PyUnicode_DATA(str2);
10494 int srelease = 0, release1 = 0, release2 = 0;
10495 int skind = PyUnicode_KIND(self);
10496 int kind1 = PyUnicode_KIND(str1);
10497 int kind2 = PyUnicode_KIND(str2);
10498 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10499 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10500 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010501 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010502 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503
10504 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010505 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010507 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010508
Victor Stinner59de0ee2011-10-07 10:01:28 +020010509 if (str1 == str2)
10510 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511
Victor Stinner49a0a212011-10-12 23:46:10 +020010512 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010513 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10514 if (maxchar < maxchar_str1)
10515 /* substring too wide to be present */
10516 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010517 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10518 /* Replacing str1 with str2 may cause a maxchar reduction in the
10519 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010520 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010521 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010524 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010526 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010527 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010528 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010529 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010530 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010531
Victor Stinner69ed0f42013-04-09 21:48:24 +020010532 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010533 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010534 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010535 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010536 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010538 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010540
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010541 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10542 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010543 }
10544 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010545 int rkind = skind;
10546 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010547 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010548
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010549 if (kind1 < rkind) {
10550 /* widen substring */
10551 buf1 = _PyUnicode_AsKind(str1, rkind);
10552 if (!buf1) goto error;
10553 release1 = 1;
10554 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010555 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010556 if (i < 0)
10557 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 if (rkind > kind2) {
10559 /* widen replacement */
10560 buf2 = _PyUnicode_AsKind(str2, rkind);
10561 if (!buf2) goto error;
10562 release2 = 1;
10563 }
10564 else if (rkind < kind2) {
10565 /* widen self and buf1 */
10566 rkind = kind2;
10567 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010568 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 sbuf = _PyUnicode_AsKind(self, rkind);
10570 if (!sbuf) goto error;
10571 srelease = 1;
10572 buf1 = _PyUnicode_AsKind(str1, rkind);
10573 if (!buf1) goto error;
10574 release1 = 1;
10575 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010576 u = PyUnicode_New(slen, maxchar);
10577 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010579 assert(PyUnicode_KIND(u) == rkind);
10580 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010581
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010582 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010583 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010584 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010586 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010588
10589 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010590 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010591 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010592 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010593 if (i == -1)
10594 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010595 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010597 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010599 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010601 }
10602 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010604 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 int rkind = skind;
10606 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010607
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010609 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 buf1 = _PyUnicode_AsKind(str1, rkind);
10611 if (!buf1) goto error;
10612 release1 = 1;
10613 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010614 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010615 if (n == 0)
10616 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010618 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 buf2 = _PyUnicode_AsKind(str2, rkind);
10620 if (!buf2) goto error;
10621 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010622 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010624 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010625 rkind = kind2;
10626 sbuf = _PyUnicode_AsKind(self, rkind);
10627 if (!sbuf) goto error;
10628 srelease = 1;
10629 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010630 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 buf1 = _PyUnicode_AsKind(str1, rkind);
10632 if (!buf1) goto error;
10633 release1 = 1;
10634 }
10635 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10636 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010637 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 PyErr_SetString(PyExc_OverflowError,
10639 "replace string is too long");
10640 goto error;
10641 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010642 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010643 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010644 _Py_INCREF_UNICODE_EMPTY();
10645 if (!unicode_empty)
10646 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010647 u = unicode_empty;
10648 goto done;
10649 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010650 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 PyErr_SetString(PyExc_OverflowError,
10652 "replace string is too long");
10653 goto error;
10654 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010655 u = PyUnicode_New(new_size, maxchar);
10656 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010658 assert(PyUnicode_KIND(u) == rkind);
10659 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 ires = i = 0;
10661 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010662 while (n-- > 0) {
10663 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010664 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010665 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010666 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010667 if (j == -1)
10668 break;
10669 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010670 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010671 memcpy(res + rkind * ires,
10672 sbuf + rkind * i,
10673 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010675 }
10676 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010678 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010680 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010684 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010686 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010687 memcpy(res + rkind * ires,
10688 sbuf + rkind * i,
10689 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010690 }
10691 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010692 /* interleave */
10693 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010694 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010696 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010698 if (--n <= 0)
10699 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010700 memcpy(res + rkind * ires,
10701 sbuf + rkind * i,
10702 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 ires++;
10704 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010705 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010706 memcpy(res + rkind * ires,
10707 sbuf + rkind * i,
10708 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010709 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010710 }
10711
10712 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010713 unicode_adjust_maxchar(&u);
10714 if (u == NULL)
10715 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010716 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010717
10718 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 if (srelease)
10720 PyMem_FREE(sbuf);
10721 if (release1)
10722 PyMem_FREE(buf1);
10723 if (release2)
10724 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010725 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010727
Benjamin Peterson29060642009-01-31 22:14:21 +000010728 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010729 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 if (srelease)
10731 PyMem_FREE(sbuf);
10732 if (release1)
10733 PyMem_FREE(buf1);
10734 if (release2)
10735 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010736 return unicode_result_unchanged(self);
10737
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 error:
10739 if (srelease && sbuf)
10740 PyMem_FREE(sbuf);
10741 if (release1 && buf1)
10742 PyMem_FREE(buf1);
10743 if (release2 && buf2)
10744 PyMem_FREE(buf2);
10745 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010746}
10747
10748/* --- Unicode Object Methods --------------------------------------------- */
10749
INADA Naoki3ae20562017-01-16 20:41:20 +090010750/*[clinic input]
10751str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010752
INADA Naoki3ae20562017-01-16 20:41:20 +090010753Return a version of the string where each word is titlecased.
10754
10755More specifically, words start with uppercased characters and all remaining
10756cased characters have lower case.
10757[clinic start generated code]*/
10758
10759static PyObject *
10760unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010761/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010762{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010763 if (PyUnicode_READY(self) == -1)
10764 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010765 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010766}
10767
INADA Naoki3ae20562017-01-16 20:41:20 +090010768/*[clinic input]
10769str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010770
INADA Naoki3ae20562017-01-16 20:41:20 +090010771Return a capitalized version of the string.
10772
10773More specifically, make the first character have upper case and the rest lower
10774case.
10775[clinic start generated code]*/
10776
10777static PyObject *
10778unicode_capitalize_impl(PyObject *self)
10779/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010780{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010781 if (PyUnicode_READY(self) == -1)
10782 return NULL;
10783 if (PyUnicode_GET_LENGTH(self) == 0)
10784 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010785 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010786}
10787
INADA Naoki3ae20562017-01-16 20:41:20 +090010788/*[clinic input]
10789str.casefold as unicode_casefold
10790
10791Return a version of the string suitable for caseless comparisons.
10792[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010793
10794static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010795unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010796/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010797{
10798 if (PyUnicode_READY(self) == -1)
10799 return NULL;
10800 if (PyUnicode_IS_ASCII(self))
10801 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010802 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010803}
10804
10805
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010806/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010807
10808static int
10809convert_uc(PyObject *obj, void *addr)
10810{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010811 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010812
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010813 if (!PyUnicode_Check(obj)) {
10814 PyErr_Format(PyExc_TypeError,
10815 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010816 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010817 return 0;
10818 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010819 if (PyUnicode_READY(obj) < 0)
10820 return 0;
10821 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010822 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010823 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010824 return 0;
10825 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010826 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010827 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010828}
10829
INADA Naoki3ae20562017-01-16 20:41:20 +090010830/*[clinic input]
10831str.center as unicode_center
10832
10833 width: Py_ssize_t
10834 fillchar: Py_UCS4 = ' '
10835 /
10836
10837Return a centered string of length width.
10838
10839Padding is done using the specified fill character (default is a space).
10840[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010841
10842static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010843unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10844/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010845{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010846 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010847
Benjamin Petersonbac79492012-01-14 13:34:47 -050010848 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010849 return NULL;
10850
Victor Stinnerc4b49542011-12-11 22:44:26 +010010851 if (PyUnicode_GET_LENGTH(self) >= width)
10852 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853
Victor Stinnerc4b49542011-12-11 22:44:26 +010010854 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855 left = marg / 2 + (marg & width & 1);
10856
Victor Stinner9310abb2011-10-05 00:59:23 +020010857 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858}
10859
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010860/* This function assumes that str1 and str2 are readied by the caller. */
10861
Marc-André Lemburge5034372000-08-08 08:04:29 +000010862static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010863unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010864{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010865#define COMPARE(TYPE1, TYPE2) \
10866 do { \
10867 TYPE1* p1 = (TYPE1 *)data1; \
10868 TYPE2* p2 = (TYPE2 *)data2; \
10869 TYPE1* end = p1 + len; \
10870 Py_UCS4 c1, c2; \
10871 for (; p1 != end; p1++, p2++) { \
10872 c1 = *p1; \
10873 c2 = *p2; \
10874 if (c1 != c2) \
10875 return (c1 < c2) ? -1 : 1; \
10876 } \
10877 } \
10878 while (0)
10879
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010880 int kind1, kind2;
10881 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010882 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010883
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010884 kind1 = PyUnicode_KIND(str1);
10885 kind2 = PyUnicode_KIND(str2);
10886 data1 = PyUnicode_DATA(str1);
10887 data2 = PyUnicode_DATA(str2);
10888 len1 = PyUnicode_GET_LENGTH(str1);
10889 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010890 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010891
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010892 switch(kind1) {
10893 case PyUnicode_1BYTE_KIND:
10894 {
10895 switch(kind2) {
10896 case PyUnicode_1BYTE_KIND:
10897 {
10898 int cmp = memcmp(data1, data2, len);
10899 /* normalize result of memcmp() into the range [-1; 1] */
10900 if (cmp < 0)
10901 return -1;
10902 if (cmp > 0)
10903 return 1;
10904 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010905 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010906 case PyUnicode_2BYTE_KIND:
10907 COMPARE(Py_UCS1, Py_UCS2);
10908 break;
10909 case PyUnicode_4BYTE_KIND:
10910 COMPARE(Py_UCS1, Py_UCS4);
10911 break;
10912 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010913 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010914 }
10915 break;
10916 }
10917 case PyUnicode_2BYTE_KIND:
10918 {
10919 switch(kind2) {
10920 case PyUnicode_1BYTE_KIND:
10921 COMPARE(Py_UCS2, Py_UCS1);
10922 break;
10923 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010924 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010925 COMPARE(Py_UCS2, Py_UCS2);
10926 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010927 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010928 case PyUnicode_4BYTE_KIND:
10929 COMPARE(Py_UCS2, Py_UCS4);
10930 break;
10931 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010932 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010933 }
10934 break;
10935 }
10936 case PyUnicode_4BYTE_KIND:
10937 {
10938 switch(kind2) {
10939 case PyUnicode_1BYTE_KIND:
10940 COMPARE(Py_UCS4, Py_UCS1);
10941 break;
10942 case PyUnicode_2BYTE_KIND:
10943 COMPARE(Py_UCS4, Py_UCS2);
10944 break;
10945 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010946 {
10947#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10948 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10949 /* normalize result of wmemcmp() into the range [-1; 1] */
10950 if (cmp < 0)
10951 return -1;
10952 if (cmp > 0)
10953 return 1;
10954#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010955 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010956#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010957 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010958 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010959 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010960 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010961 }
10962 break;
10963 }
10964 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010965 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010966 }
10967
Victor Stinner770e19e2012-10-04 22:59:45 +020010968 if (len1 == len2)
10969 return 0;
10970 if (len1 < len2)
10971 return -1;
10972 else
10973 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010974
10975#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010976}
10977
Benjamin Peterson621b4302016-09-09 13:54:34 -070010978static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010979unicode_compare_eq(PyObject *str1, PyObject *str2)
10980{
10981 int kind;
10982 void *data1, *data2;
10983 Py_ssize_t len;
10984 int cmp;
10985
Victor Stinnere5567ad2012-10-23 02:48:49 +020010986 len = PyUnicode_GET_LENGTH(str1);
10987 if (PyUnicode_GET_LENGTH(str2) != len)
10988 return 0;
10989 kind = PyUnicode_KIND(str1);
10990 if (PyUnicode_KIND(str2) != kind)
10991 return 0;
10992 data1 = PyUnicode_DATA(str1);
10993 data2 = PyUnicode_DATA(str2);
10994
10995 cmp = memcmp(data1, data2, len * kind);
10996 return (cmp == 0);
10997}
10998
10999
Alexander Belopolsky40018472011-02-26 01:02:56 +000011000int
11001PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011002{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011003 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11004 if (PyUnicode_READY(left) == -1 ||
11005 PyUnicode_READY(right) == -1)
11006 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011007
11008 /* a string is equal to itself */
11009 if (left == right)
11010 return 0;
11011
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011012 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011014 PyErr_Format(PyExc_TypeError,
11015 "Can't compare %.100s and %.100s",
11016 left->ob_type->tp_name,
11017 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011018 return -1;
11019}
11020
Martin v. Löwis5b222132007-06-10 09:51:05 +000011021int
11022PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024 Py_ssize_t i;
11025 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011027 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028
Victor Stinner910337b2011-10-03 03:20:16 +020011029 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011030 if (!PyUnicode_IS_READY(uni)) {
11031 const wchar_t *ws = _PyUnicode_WSTR(uni);
11032 /* Compare Unicode string and source character set string */
11033 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11034 if (chr != ustr[i])
11035 return (chr < ustr[i]) ? -1 : 1;
11036 }
11037 /* This check keeps Python strings that end in '\0' from comparing equal
11038 to C strings identical up to that point. */
11039 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11040 return 1; /* uni is longer */
11041 if (ustr[i])
11042 return -1; /* str is longer */
11043 return 0;
11044 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011045 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011046 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011047 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011048 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011049 size_t len, len2 = strlen(str);
11050 int cmp;
11051
11052 len = Py_MIN(len1, len2);
11053 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011054 if (cmp != 0) {
11055 if (cmp < 0)
11056 return -1;
11057 else
11058 return 1;
11059 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011060 if (len1 > len2)
11061 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011062 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011063 return -1; /* str is longer */
11064 return 0;
11065 }
11066 else {
11067 void *data = PyUnicode_DATA(uni);
11068 /* Compare Unicode string and source character set string */
11069 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011070 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011071 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11072 /* This check keeps Python strings that end in '\0' from comparing equal
11073 to C strings identical up to that point. */
11074 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11075 return 1; /* uni is longer */
11076 if (str[i])
11077 return -1; /* str is longer */
11078 return 0;
11079 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011080}
11081
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011082static int
11083non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11084{
11085 size_t i, len;
11086 const wchar_t *p;
11087 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11088 if (strlen(str) != len)
11089 return 0;
11090 p = _PyUnicode_WSTR(unicode);
11091 assert(p);
11092 for (i = 0; i < len; i++) {
11093 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011094 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011095 return 0;
11096 }
11097 return 1;
11098}
11099
11100int
11101_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11102{
11103 size_t len;
11104 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011105 assert(str);
11106#ifndef NDEBUG
11107 for (const char *p = str; *p; p++) {
11108 assert((unsigned char)*p < 128);
11109 }
11110#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011111 if (PyUnicode_READY(unicode) == -1) {
11112 /* Memory error or bad data */
11113 PyErr_Clear();
11114 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11115 }
11116 if (!PyUnicode_IS_ASCII(unicode))
11117 return 0;
11118 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11119 return strlen(str) == len &&
11120 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11121}
11122
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011123int
11124_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11125{
11126 PyObject *right_uni;
11127 Py_hash_t hash;
11128
11129 assert(_PyUnicode_CHECK(left));
11130 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011131#ifndef NDEBUG
11132 for (const char *p = right->string; *p; p++) {
11133 assert((unsigned char)*p < 128);
11134 }
11135#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011136
11137 if (PyUnicode_READY(left) == -1) {
11138 /* memory error or bad data */
11139 PyErr_Clear();
11140 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11141 }
11142
11143 if (!PyUnicode_IS_ASCII(left))
11144 return 0;
11145
11146 right_uni = _PyUnicode_FromId(right); /* borrowed */
11147 if (right_uni == NULL) {
11148 /* memory error or bad data */
11149 PyErr_Clear();
11150 return _PyUnicode_EqualToASCIIString(left, right->string);
11151 }
11152
11153 if (left == right_uni)
11154 return 1;
11155
11156 if (PyUnicode_CHECK_INTERNED(left))
11157 return 0;
11158
INADA Naoki7cc95f52018-01-28 02:07:09 +090011159 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011160 hash = _PyUnicode_HASH(left);
11161 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11162 return 0;
11163
11164 return unicode_compare_eq(left, right_uni);
11165}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011166
Alexander Belopolsky40018472011-02-26 01:02:56 +000011167PyObject *
11168PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011169{
11170 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011171
Victor Stinnere5567ad2012-10-23 02:48:49 +020011172 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11173 Py_RETURN_NOTIMPLEMENTED;
11174
11175 if (PyUnicode_READY(left) == -1 ||
11176 PyUnicode_READY(right) == -1)
11177 return NULL;
11178
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011179 if (left == right) {
11180 switch (op) {
11181 case Py_EQ:
11182 case Py_LE:
11183 case Py_GE:
11184 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011185 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011186 case Py_NE:
11187 case Py_LT:
11188 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011189 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011190 default:
11191 PyErr_BadArgument();
11192 return NULL;
11193 }
11194 }
11195 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011196 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011197 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011198 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011199 }
11200 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011201 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011202 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011203 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011204}
11205
Alexander Belopolsky40018472011-02-26 01:02:56 +000011206int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011207_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11208{
11209 return unicode_eq(aa, bb);
11210}
11211
11212int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011213PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011214{
Victor Stinner77282cb2013-04-14 19:22:47 +020011215 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216 void *buf1, *buf2;
11217 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011218 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011219
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011220 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011221 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011222 "'in <string>' requires string as left operand, not %.100s",
11223 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011224 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011225 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011226 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011227 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011228 if (ensure_unicode(str) < 0)
11229 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011231 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011232 kind2 = PyUnicode_KIND(substr);
11233 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011234 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011235 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011236 len2 = PyUnicode_GET_LENGTH(substr);
11237 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011238 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011239 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011240 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011241 if (len2 == 1) {
11242 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11243 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011244 return result;
11245 }
11246 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011247 buf2 = _PyUnicode_AsKind(substr, kind1);
11248 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011249 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011251
Victor Stinner77282cb2013-04-14 19:22:47 +020011252 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 case PyUnicode_1BYTE_KIND:
11254 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11255 break;
11256 case PyUnicode_2BYTE_KIND:
11257 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11258 break;
11259 case PyUnicode_4BYTE_KIND:
11260 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11261 break;
11262 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011263 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011264 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011265
Victor Stinner77282cb2013-04-14 19:22:47 +020011266 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 PyMem_Free(buf2);
11268
Guido van Rossum403d68b2000-03-13 15:55:09 +000011269 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011270}
11271
Guido van Rossumd57fd912000-03-10 22:53:23 +000011272/* Concat to string or Unicode object giving a new Unicode object. */
11273
Alexander Belopolsky40018472011-02-26 01:02:56 +000011274PyObject *
11275PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011276{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011277 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011278 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011279 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011280
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011281 if (ensure_unicode(left) < 0)
11282 return NULL;
11283
11284 if (!PyUnicode_Check(right)) {
11285 PyErr_Format(PyExc_TypeError,
11286 "can only concatenate str (not \"%.200s\") to str",
11287 right->ob_type->tp_name);
11288 return NULL;
11289 }
11290 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011291 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292
11293 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011294 if (left == unicode_empty)
11295 return PyUnicode_FromObject(right);
11296 if (right == unicode_empty)
11297 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011299 left_len = PyUnicode_GET_LENGTH(left);
11300 right_len = PyUnicode_GET_LENGTH(right);
11301 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011302 PyErr_SetString(PyExc_OverflowError,
11303 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011304 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011305 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011306 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011307
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011308 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11309 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011310 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011311
Guido van Rossumd57fd912000-03-10 22:53:23 +000011312 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011313 result = PyUnicode_New(new_len, maxchar);
11314 if (result == NULL)
11315 return NULL;
11316 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11317 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11318 assert(_PyUnicode_CheckConsistency(result, 1));
11319 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011320}
11321
Walter Dörwald1ab83302007-05-18 17:15:44 +000011322void
Victor Stinner23e56682011-10-03 03:54:37 +020011323PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011324{
Victor Stinner23e56682011-10-03 03:54:37 +020011325 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011326 Py_UCS4 maxchar, maxchar2;
11327 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011328
11329 if (p_left == NULL) {
11330 if (!PyErr_Occurred())
11331 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011332 return;
11333 }
Victor Stinner23e56682011-10-03 03:54:37 +020011334 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011335 if (right == NULL || left == NULL
11336 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011337 if (!PyErr_Occurred())
11338 PyErr_BadInternalCall();
11339 goto error;
11340 }
11341
Benjamin Petersonbac79492012-01-14 13:34:47 -050011342 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011343 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011344 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011345 goto error;
11346
Victor Stinner488fa492011-12-12 00:01:39 +010011347 /* Shortcuts */
11348 if (left == unicode_empty) {
11349 Py_DECREF(left);
11350 Py_INCREF(right);
11351 *p_left = right;
11352 return;
11353 }
11354 if (right == unicode_empty)
11355 return;
11356
11357 left_len = PyUnicode_GET_LENGTH(left);
11358 right_len = PyUnicode_GET_LENGTH(right);
11359 if (left_len > PY_SSIZE_T_MAX - right_len) {
11360 PyErr_SetString(PyExc_OverflowError,
11361 "strings are too large to concat");
11362 goto error;
11363 }
11364 new_len = left_len + right_len;
11365
11366 if (unicode_modifiable(left)
11367 && PyUnicode_CheckExact(right)
11368 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011369 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11370 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011371 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011372 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011373 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11374 {
11375 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011376 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011377 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011378
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011379 /* copy 'right' into the newly allocated area of 'left' */
11380 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011381 }
Victor Stinner488fa492011-12-12 00:01:39 +010011382 else {
11383 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11384 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011385 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011386
Victor Stinner488fa492011-12-12 00:01:39 +010011387 /* Concat the two Unicode strings */
11388 res = PyUnicode_New(new_len, maxchar);
11389 if (res == NULL)
11390 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011391 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11392 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011393 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011394 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011395 }
11396 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011397 return;
11398
11399error:
Victor Stinner488fa492011-12-12 00:01:39 +010011400 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011401}
11402
11403void
11404PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11405{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011406 PyUnicode_Append(pleft, right);
11407 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011408}
11409
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011410/*
11411Wraps stringlib_parse_args_finds() and additionally ensures that the
11412first argument is a unicode object.
11413*/
11414
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011415static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011416parse_args_finds_unicode(const char * function_name, PyObject *args,
11417 PyObject **substring,
11418 Py_ssize_t *start, Py_ssize_t *end)
11419{
11420 if(stringlib_parse_args_finds(function_name, args, substring,
11421 start, end)) {
11422 if (ensure_unicode(*substring) < 0)
11423 return 0;
11424 return 1;
11425 }
11426 return 0;
11427}
11428
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011429PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011430 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011432Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011433string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011434interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435
11436static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011437unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011439 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011440 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011441 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011442 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011443 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 void *buf1, *buf2;
11445 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011446
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011447 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011448 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 kind1 = PyUnicode_KIND(self);
11451 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011452 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011453 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011455 len1 = PyUnicode_GET_LENGTH(self);
11456 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011457 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011458 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011459 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011460
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011461 buf1 = PyUnicode_DATA(self);
11462 buf2 = PyUnicode_DATA(substring);
11463 if (kind2 != kind1) {
11464 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011465 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011466 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011467 }
11468 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 case PyUnicode_1BYTE_KIND:
11470 iresult = ucs1lib_count(
11471 ((Py_UCS1*)buf1) + start, end - start,
11472 buf2, len2, PY_SSIZE_T_MAX
11473 );
11474 break;
11475 case PyUnicode_2BYTE_KIND:
11476 iresult = ucs2lib_count(
11477 ((Py_UCS2*)buf1) + start, end - start,
11478 buf2, len2, PY_SSIZE_T_MAX
11479 );
11480 break;
11481 case PyUnicode_4BYTE_KIND:
11482 iresult = ucs4lib_count(
11483 ((Py_UCS4*)buf1) + start, end - start,
11484 buf2, len2, PY_SSIZE_T_MAX
11485 );
11486 break;
11487 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011488 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 }
11490
11491 result = PyLong_FromSsize_t(iresult);
11492
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011493 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011494 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011495
Guido van Rossumd57fd912000-03-10 22:53:23 +000011496 return result;
11497}
11498
INADA Naoki3ae20562017-01-16 20:41:20 +090011499/*[clinic input]
11500str.encode as unicode_encode
11501
11502 encoding: str(c_default="NULL") = 'utf-8'
11503 The encoding in which to encode the string.
11504 errors: str(c_default="NULL") = 'strict'
11505 The error handling scheme to use for encoding errors.
11506 The default is 'strict' meaning that encoding errors raise a
11507 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11508 'xmlcharrefreplace' as well as any other name registered with
11509 codecs.register_error that can handle UnicodeEncodeErrors.
11510
11511Encode the string using the codec registered for encoding.
11512[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
11514static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011515unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011516/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011518 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011519}
11520
INADA Naoki3ae20562017-01-16 20:41:20 +090011521/*[clinic input]
11522str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523
INADA Naoki3ae20562017-01-16 20:41:20 +090011524 tabsize: int = 8
11525
11526Return a copy where all tab characters are expanded using spaces.
11527
11528If tabsize is not given, a tab size of 8 characters is assumed.
11529[clinic start generated code]*/
11530
11531static PyObject *
11532unicode_expandtabs_impl(PyObject *self, int tabsize)
11533/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011535 Py_ssize_t i, j, line_pos, src_len, incr;
11536 Py_UCS4 ch;
11537 PyObject *u;
11538 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011539 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011540 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011541
Antoine Pitrou22425222011-10-04 19:10:51 +020011542 if (PyUnicode_READY(self) == -1)
11543 return NULL;
11544
Thomas Wouters7e474022000-07-16 12:04:32 +000011545 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011546 src_len = PyUnicode_GET_LENGTH(self);
11547 i = j = line_pos = 0;
11548 kind = PyUnicode_KIND(self);
11549 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011550 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011551 for (; i < src_len; i++) {
11552 ch = PyUnicode_READ(kind, src_data, i);
11553 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011554 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011556 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011558 goto overflow;
11559 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011560 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011561 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011562 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011564 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011565 goto overflow;
11566 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011568 if (ch == '\n' || ch == '\r')
11569 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011571 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011572 if (!found)
11573 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011574
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011576 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577 if (!u)
11578 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011579 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580
Antoine Pitroue71d5742011-10-04 15:55:09 +020011581 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011582
Antoine Pitroue71d5742011-10-04 15:55:09 +020011583 for (; i < src_len; i++) {
11584 ch = PyUnicode_READ(kind, src_data, i);
11585 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011587 incr = tabsize - (line_pos % tabsize);
11588 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011589 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011590 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011591 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011592 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011593 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011594 line_pos++;
11595 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011596 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011597 if (ch == '\n' || ch == '\r')
11598 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011600 }
11601 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011602 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011603
Antoine Pitroue71d5742011-10-04 15:55:09 +020011604 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011605 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11606 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607}
11608
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011609PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011610 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611\n\
11612Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011613such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614arguments start and end are interpreted as in slice notation.\n\
11615\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011616Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617
11618static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011621 /* initialize variables to prevent gcc warning */
11622 PyObject *substring = NULL;
11623 Py_ssize_t start = 0;
11624 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011625 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011627 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011630 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011631 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011633 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635 if (result == -2)
11636 return NULL;
11637
Christian Heimes217cfd12007-12-02 14:31:20 +000011638 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639}
11640
11641static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011642unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011644 void *data;
11645 enum PyUnicode_Kind kind;
11646 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011647
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011648 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011649 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011650 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011651 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011652 if (PyUnicode_READY(self) == -1) {
11653 return NULL;
11654 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011655 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11656 PyErr_SetString(PyExc_IndexError, "string index out of range");
11657 return NULL;
11658 }
11659 kind = PyUnicode_KIND(self);
11660 data = PyUnicode_DATA(self);
11661 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011662 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011663}
11664
Guido van Rossumc2504932007-09-18 19:42:40 +000011665/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011666 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011667static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011668unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011669{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011670 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011671
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011672#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011673 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011674#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 if (_PyUnicode_HASH(self) != -1)
11676 return _PyUnicode_HASH(self);
11677 if (PyUnicode_READY(self) == -1)
11678 return -1;
animalizea1d14252019-01-02 20:16:06 +080011679
Christian Heimes985ecdc2013-11-20 11:46:18 +010011680 x = _Py_HashBytes(PyUnicode_DATA(self),
11681 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011682 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011683 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684}
11685
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011686PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011687 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688\n\
oldkaa0735f2018-02-02 16:52:55 +080011689Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011690such that sub is contained within S[start:end]. Optional\n\
11691arguments start and end are interpreted as in slice notation.\n\
11692\n\
11693Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694
11695static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011698 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011699 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011700 PyObject *substring = NULL;
11701 Py_ssize_t start = 0;
11702 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011704 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011705 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011707 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011708 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011710 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712 if (result == -2)
11713 return NULL;
11714
Guido van Rossumd57fd912000-03-10 22:53:23 +000011715 if (result < 0) {
11716 PyErr_SetString(PyExc_ValueError, "substring not found");
11717 return NULL;
11718 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011719
Christian Heimes217cfd12007-12-02 14:31:20 +000011720 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721}
11722
INADA Naoki3ae20562017-01-16 20:41:20 +090011723/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011724str.isascii as unicode_isascii
11725
11726Return True if all characters in the string are ASCII, False otherwise.
11727
11728ASCII characters have code points in the range U+0000-U+007F.
11729Empty string is ASCII too.
11730[clinic start generated code]*/
11731
11732static PyObject *
11733unicode_isascii_impl(PyObject *self)
11734/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11735{
11736 if (PyUnicode_READY(self) == -1) {
11737 return NULL;
11738 }
11739 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11740}
11741
11742/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011743str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011744
INADA Naoki3ae20562017-01-16 20:41:20 +090011745Return True if the string is a lowercase string, False otherwise.
11746
11747A string is lowercase if all cased characters in the string are lowercase and
11748there is at least one cased character in the string.
11749[clinic start generated code]*/
11750
11751static PyObject *
11752unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011753/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011754{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 Py_ssize_t i, length;
11756 int kind;
11757 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011758 int cased;
11759
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 if (PyUnicode_READY(self) == -1)
11761 return NULL;
11762 length = PyUnicode_GET_LENGTH(self);
11763 kind = PyUnicode_KIND(self);
11764 data = PyUnicode_DATA(self);
11765
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 if (length == 1)
11768 return PyBool_FromLong(
11769 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011771 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011772 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011773 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011774
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011776 for (i = 0; i < length; i++) {
11777 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011778
Benjamin Peterson29060642009-01-31 22:14:21 +000011779 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011780 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011781 else if (!cased && Py_UNICODE_ISLOWER(ch))
11782 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011784 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785}
11786
INADA Naoki3ae20562017-01-16 20:41:20 +090011787/*[clinic input]
11788str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011789
INADA Naoki3ae20562017-01-16 20:41:20 +090011790Return True if the string is an uppercase string, False otherwise.
11791
11792A string is uppercase if all cased characters in the string are uppercase and
11793there is at least one cased character in the string.
11794[clinic start generated code]*/
11795
11796static PyObject *
11797unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011798/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 Py_ssize_t i, length;
11801 int kind;
11802 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803 int cased;
11804
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 if (PyUnicode_READY(self) == -1)
11806 return NULL;
11807 length = PyUnicode_GET_LENGTH(self);
11808 kind = PyUnicode_KIND(self);
11809 data = PyUnicode_DATA(self);
11810
Guido van Rossumd57fd912000-03-10 22:53:23 +000011811 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 if (length == 1)
11813 return PyBool_FromLong(
11814 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011816 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011818 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011819
Guido van Rossumd57fd912000-03-10 22:53:23 +000011820 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 for (i = 0; i < length; i++) {
11822 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011823
Benjamin Peterson29060642009-01-31 22:14:21 +000011824 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011825 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011826 else if (!cased && Py_UNICODE_ISUPPER(ch))
11827 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011828 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011829 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011830}
11831
INADA Naoki3ae20562017-01-16 20:41:20 +090011832/*[clinic input]
11833str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011834
INADA Naoki3ae20562017-01-16 20:41:20 +090011835Return True if the string is a title-cased string, False otherwise.
11836
11837In a title-cased string, upper- and title-case characters may only
11838follow uncased characters and lowercase characters only cased ones.
11839[clinic start generated code]*/
11840
11841static PyObject *
11842unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011843/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 Py_ssize_t i, length;
11846 int kind;
11847 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848 int cased, previous_is_cased;
11849
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 if (PyUnicode_READY(self) == -1)
11851 return NULL;
11852 length = PyUnicode_GET_LENGTH(self);
11853 kind = PyUnicode_KIND(self);
11854 data = PyUnicode_DATA(self);
11855
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 if (length == 1) {
11858 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11859 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11860 (Py_UNICODE_ISUPPER(ch) != 0));
11861 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011863 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011865 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011866
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867 cased = 0;
11868 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011869 for (i = 0; i < length; i++) {
11870 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011871
Benjamin Peterson29060642009-01-31 22:14:21 +000011872 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11873 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011874 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011875 previous_is_cased = 1;
11876 cased = 1;
11877 }
11878 else if (Py_UNICODE_ISLOWER(ch)) {
11879 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011880 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011881 previous_is_cased = 1;
11882 cased = 1;
11883 }
11884 else
11885 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011887 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011888}
11889
INADA Naoki3ae20562017-01-16 20:41:20 +090011890/*[clinic input]
11891str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011892
INADA Naoki3ae20562017-01-16 20:41:20 +090011893Return True if the string is a whitespace string, False otherwise.
11894
11895A string is whitespace if all characters in the string are whitespace and there
11896is at least one character in the string.
11897[clinic start generated code]*/
11898
11899static PyObject *
11900unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011901/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 Py_ssize_t i, length;
11904 int kind;
11905 void *data;
11906
11907 if (PyUnicode_READY(self) == -1)
11908 return NULL;
11909 length = PyUnicode_GET_LENGTH(self);
11910 kind = PyUnicode_KIND(self);
11911 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912
Guido van Rossumd57fd912000-03-10 22:53:23 +000011913 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 if (length == 1)
11915 return PyBool_FromLong(
11916 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011918 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011920 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011921
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 for (i = 0; i < length; i++) {
11923 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011924 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011925 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011927 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011928}
11929
INADA Naoki3ae20562017-01-16 20:41:20 +090011930/*[clinic input]
11931str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011932
INADA Naoki3ae20562017-01-16 20:41:20 +090011933Return True if the string is an alphabetic string, False otherwise.
11934
11935A string is alphabetic if all characters in the string are alphabetic and there
11936is at least one character in the string.
11937[clinic start generated code]*/
11938
11939static PyObject *
11940unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011941/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011942{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 Py_ssize_t i, length;
11944 int kind;
11945 void *data;
11946
11947 if (PyUnicode_READY(self) == -1)
11948 return NULL;
11949 length = PyUnicode_GET_LENGTH(self);
11950 kind = PyUnicode_KIND(self);
11951 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011952
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011953 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 if (length == 1)
11955 return PyBool_FromLong(
11956 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011957
11958 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011960 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 for (i = 0; i < length; i++) {
11963 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011964 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011965 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011966 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011967}
11968
INADA Naoki3ae20562017-01-16 20:41:20 +090011969/*[clinic input]
11970str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011971
INADA Naoki3ae20562017-01-16 20:41:20 +090011972Return True if the string is an alpha-numeric string, False otherwise.
11973
11974A string is alpha-numeric if all characters in the string are alpha-numeric and
11975there is at least one character in the string.
11976[clinic start generated code]*/
11977
11978static PyObject *
11979unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011980/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011981{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011982 int kind;
11983 void *data;
11984 Py_ssize_t len, i;
11985
11986 if (PyUnicode_READY(self) == -1)
11987 return NULL;
11988
11989 kind = PyUnicode_KIND(self);
11990 data = PyUnicode_DATA(self);
11991 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011992
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011993 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011994 if (len == 1) {
11995 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11996 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11997 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011998
11999 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012001 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 for (i = 0; i < len; i++) {
12004 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012005 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012006 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012007 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012008 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012009}
12010
INADA Naoki3ae20562017-01-16 20:41:20 +090012011/*[clinic input]
12012str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013
INADA Naoki3ae20562017-01-16 20:41:20 +090012014Return True if the string is a decimal string, False otherwise.
12015
12016A string is a decimal string if all characters in the string are decimal and
12017there is at least one character in the string.
12018[clinic start generated code]*/
12019
12020static PyObject *
12021unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012022/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 Py_ssize_t i, length;
12025 int kind;
12026 void *data;
12027
12028 if (PyUnicode_READY(self) == -1)
12029 return NULL;
12030 length = PyUnicode_GET_LENGTH(self);
12031 kind = PyUnicode_KIND(self);
12032 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 if (length == 1)
12036 return PyBool_FromLong(
12037 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012039 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012041 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 for (i = 0; i < length; i++) {
12044 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012045 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012046 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012047 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048}
12049
INADA Naoki3ae20562017-01-16 20:41:20 +090012050/*[clinic input]
12051str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052
INADA Naoki3ae20562017-01-16 20:41:20 +090012053Return True if the string is a digit string, False otherwise.
12054
12055A string is a digit string if all characters in the string are digits and there
12056is at least one character in the string.
12057[clinic start generated code]*/
12058
12059static PyObject *
12060unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012061/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 Py_ssize_t i, length;
12064 int kind;
12065 void *data;
12066
12067 if (PyUnicode_READY(self) == -1)
12068 return NULL;
12069 length = PyUnicode_GET_LENGTH(self);
12070 kind = PyUnicode_KIND(self);
12071 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012072
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012074 if (length == 1) {
12075 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12076 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12077 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012079 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012080 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012081 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 for (i = 0; i < length; i++) {
12084 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012085 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012086 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012087 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088}
12089
INADA Naoki3ae20562017-01-16 20:41:20 +090012090/*[clinic input]
12091str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092
INADA Naoki3ae20562017-01-16 20:41:20 +090012093Return True if the string is a numeric string, False otherwise.
12094
12095A string is numeric if all characters in the string are numeric and there is at
12096least one character in the string.
12097[clinic start generated code]*/
12098
12099static PyObject *
12100unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012101/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 Py_ssize_t i, length;
12104 int kind;
12105 void *data;
12106
12107 if (PyUnicode_READY(self) == -1)
12108 return NULL;
12109 length = PyUnicode_GET_LENGTH(self);
12110 kind = PyUnicode_KIND(self);
12111 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112
Guido van Rossumd57fd912000-03-10 22:53:23 +000012113 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 if (length == 1)
12115 return PyBool_FromLong(
12116 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012118 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012120 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 for (i = 0; i < length; i++) {
12123 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012124 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012125 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012126 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127}
12128
Martin v. Löwis47383402007-08-15 07:32:56 +000012129int
12130PyUnicode_IsIdentifier(PyObject *self)
12131{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 int kind;
12133 void *data;
12134 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012135 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012136
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137 if (PyUnicode_READY(self) == -1) {
12138 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012139 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 }
12141
12142 /* Special case for empty strings */
12143 if (PyUnicode_GET_LENGTH(self) == 0)
12144 return 0;
12145 kind = PyUnicode_KIND(self);
12146 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012147
12148 /* PEP 3131 says that the first character must be in
12149 XID_Start and subsequent characters in XID_Continue,
12150 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012151 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012152 letters, digits, underscore). However, given the current
12153 definition of XID_Start and XID_Continue, it is sufficient
12154 to check just for these, except that _ must be allowed
12155 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012157 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012158 return 0;
12159
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012160 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012162 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012163 return 1;
12164}
12165
INADA Naoki3ae20562017-01-16 20:41:20 +090012166/*[clinic input]
12167str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012168
INADA Naoki3ae20562017-01-16 20:41:20 +090012169Return True if the string is a valid Python identifier, False otherwise.
12170
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012171Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012172such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012173[clinic start generated code]*/
12174
12175static PyObject *
12176unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012177/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012178{
12179 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12180}
12181
INADA Naoki3ae20562017-01-16 20:41:20 +090012182/*[clinic input]
12183str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012184
INADA Naoki3ae20562017-01-16 20:41:20 +090012185Return True if the string is printable, False otherwise.
12186
12187A string is printable if all of its characters are considered printable in
12188repr() or if it is empty.
12189[clinic start generated code]*/
12190
12191static PyObject *
12192unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012193/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012194{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 Py_ssize_t i, length;
12196 int kind;
12197 void *data;
12198
12199 if (PyUnicode_READY(self) == -1)
12200 return NULL;
12201 length = PyUnicode_GET_LENGTH(self);
12202 kind = PyUnicode_KIND(self);
12203 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012204
12205 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012206 if (length == 1)
12207 return PyBool_FromLong(
12208 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 for (i = 0; i < length; i++) {
12211 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012212 Py_RETURN_FALSE;
12213 }
12214 }
12215 Py_RETURN_TRUE;
12216}
12217
INADA Naoki3ae20562017-01-16 20:41:20 +090012218/*[clinic input]
12219str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220
INADA Naoki3ae20562017-01-16 20:41:20 +090012221 iterable: object
12222 /
12223
12224Concatenate any number of strings.
12225
Martin Panter91a88662017-01-24 00:30:06 +000012226The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012227The result is returned as a new string.
12228
12229Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12230[clinic start generated code]*/
12231
12232static PyObject *
12233unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012234/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235{
INADA Naoki3ae20562017-01-16 20:41:20 +090012236 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012237}
12238
Martin v. Löwis18e16552006-02-15 17:27:45 +000012239static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012240unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 if (PyUnicode_READY(self) == -1)
12243 return -1;
12244 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012245}
12246
INADA Naoki3ae20562017-01-16 20:41:20 +090012247/*[clinic input]
12248str.ljust as unicode_ljust
12249
12250 width: Py_ssize_t
12251 fillchar: Py_UCS4 = ' '
12252 /
12253
12254Return a left-justified string of length width.
12255
12256Padding is done using the specified fill character (default is a space).
12257[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258
12259static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012260unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12261/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012263 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012264 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265
Victor Stinnerc4b49542011-12-11 22:44:26 +010012266 if (PyUnicode_GET_LENGTH(self) >= width)
12267 return unicode_result_unchanged(self);
12268
12269 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270}
12271
INADA Naoki3ae20562017-01-16 20:41:20 +090012272/*[clinic input]
12273str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274
INADA Naoki3ae20562017-01-16 20:41:20 +090012275Return a copy of the string converted to lowercase.
12276[clinic start generated code]*/
12277
12278static PyObject *
12279unicode_lower_impl(PyObject *self)
12280/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012282 if (PyUnicode_READY(self) == -1)
12283 return NULL;
12284 if (PyUnicode_IS_ASCII(self))
12285 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012286 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287}
12288
/* Strip modes.  Each value doubles as the index of the matching method
   name in stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the strip mode constants above.  The table is
   never written, so both the strings and the pointer array are const. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method that implements strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012297
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012298/* externally visible for str.strip(unicode) */
12299PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012300_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012301{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012302 void *data;
12303 int kind;
12304 Py_ssize_t i, j, len;
12305 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012306 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012307
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12309 return NULL;
12310
12311 kind = PyUnicode_KIND(self);
12312 data = PyUnicode_DATA(self);
12313 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012314 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012315 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12316 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012317 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012318
Benjamin Peterson14339b62009-01-31 16:36:08 +000012319 i = 0;
12320 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012321 while (i < len) {
12322 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12323 if (!BLOOM(sepmask, ch))
12324 break;
12325 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12326 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012327 i++;
12328 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012329 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012330
Benjamin Peterson14339b62009-01-31 16:36:08 +000012331 j = len;
12332 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012333 j--;
12334 while (j >= i) {
12335 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12336 if (!BLOOM(sepmask, ch))
12337 break;
12338 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12339 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012340 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012341 }
12342
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012344 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012345
Victor Stinner7931d9a2011-11-04 00:22:48 +010012346 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012347}
12348
12349PyObject*
12350PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12351{
12352 unsigned char *data;
12353 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012354 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355
Victor Stinnerde636f32011-10-01 03:55:54 +020012356 if (PyUnicode_READY(self) == -1)
12357 return NULL;
12358
Victor Stinner684d5fd2012-05-03 02:32:34 +020012359 length = PyUnicode_GET_LENGTH(self);
12360 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012361
Victor Stinner684d5fd2012-05-03 02:32:34 +020012362 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012363 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364
Victor Stinnerde636f32011-10-01 03:55:54 +020012365 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012366 PyErr_SetString(PyExc_IndexError, "string index out of range");
12367 return NULL;
12368 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012369 if (start >= length || end < start)
12370 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012371
Victor Stinner684d5fd2012-05-03 02:32:34 +020012372 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012373 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012374 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012375 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012376 }
12377 else {
12378 kind = PyUnicode_KIND(self);
12379 data = PyUnicode_1BYTE_DATA(self);
12380 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012381 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012382 length);
12383 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012385
12386static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012387do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 Py_ssize_t len, i, j;
12390
12391 if (PyUnicode_READY(self) == -1)
12392 return NULL;
12393
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012394 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012395
Victor Stinnercc7af722013-04-09 22:39:24 +020012396 if (PyUnicode_IS_ASCII(self)) {
12397 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12398
12399 i = 0;
12400 if (striptype != RIGHTSTRIP) {
12401 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012402 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012403 if (!_Py_ascii_whitespace[ch])
12404 break;
12405 i++;
12406 }
12407 }
12408
12409 j = len;
12410 if (striptype != LEFTSTRIP) {
12411 j--;
12412 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012413 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012414 if (!_Py_ascii_whitespace[ch])
12415 break;
12416 j--;
12417 }
12418 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012419 }
12420 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012421 else {
12422 int kind = PyUnicode_KIND(self);
12423 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012424
Victor Stinnercc7af722013-04-09 22:39:24 +020012425 i = 0;
12426 if (striptype != RIGHTSTRIP) {
12427 while (i < len) {
12428 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12429 if (!Py_UNICODE_ISSPACE(ch))
12430 break;
12431 i++;
12432 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012433 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012434
12435 j = len;
12436 if (striptype != LEFTSTRIP) {
12437 j--;
12438 while (j >= i) {
12439 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12440 if (!Py_UNICODE_ISSPACE(ch))
12441 break;
12442 j--;
12443 }
12444 j++;
12445 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012446 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012447
Victor Stinner7931d9a2011-11-04 00:22:48 +010012448 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012449}
12450
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012451
12452static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012453do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012454{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012455 if (sep != NULL && sep != Py_None) {
12456 if (PyUnicode_Check(sep))
12457 return _PyUnicode_XStrip(self, striptype, sep);
12458 else {
12459 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012460 "%s arg must be None or str",
12461 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012462 return NULL;
12463 }
12464 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012465
Benjamin Peterson14339b62009-01-31 16:36:08 +000012466 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012467}
12468
12469
INADA Naoki3ae20562017-01-16 20:41:20 +090012470/*[clinic input]
12471str.strip as unicode_strip
12472
12473 chars: object = None
12474 /
12475
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012477
12478If chars is given and not None, remove characters in chars instead.
12479[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012480
12481static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012482unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012483/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012484{
INADA Naoki3ae20562017-01-16 20:41:20 +090012485 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012486}
12487
12488
INADA Naoki3ae20562017-01-16 20:41:20 +090012489/*[clinic input]
12490str.lstrip as unicode_lstrip
12491
12492 chars: object = NULL
12493 /
12494
12495Return a copy of the string with leading whitespace removed.
12496
12497If chars is given and not None, remove characters in chars instead.
12498[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012499
12500static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012501unicode_lstrip_impl(PyObject *self, PyObject *chars)
12502/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012503{
INADA Naoki3ae20562017-01-16 20:41:20 +090012504 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012505}
12506
12507
INADA Naoki3ae20562017-01-16 20:41:20 +090012508/*[clinic input]
12509str.rstrip as unicode_rstrip
12510
12511 chars: object = NULL
12512 /
12513
12514Return a copy of the string with trailing whitespace removed.
12515
12516If chars is given and not None, remove characters in chars instead.
12517[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012518
12519static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012520unicode_rstrip_impl(PyObject *self, PyObject *chars)
12521/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012522{
INADA Naoki3ae20562017-01-16 20:41:20 +090012523 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012524}
12525
12526
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012528unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012530 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012531 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532
Serhiy Storchaka05997252013-01-26 12:14:02 +020012533 if (len < 1)
12534 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535
Victor Stinnerc4b49542011-12-11 22:44:26 +010012536 /* no repeat, return original string */
12537 if (len == 1)
12538 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012539
Benjamin Petersonbac79492012-01-14 13:34:47 -050012540 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 return NULL;
12542
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012543 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012544 PyErr_SetString(PyExc_OverflowError,
12545 "repeated string is too long");
12546 return NULL;
12547 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012549
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012550 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551 if (!u)
12552 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012553 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012555 if (PyUnicode_GET_LENGTH(str) == 1) {
12556 const int kind = PyUnicode_KIND(str);
12557 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012558 if (kind == PyUnicode_1BYTE_KIND) {
12559 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012560 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012561 }
12562 else if (kind == PyUnicode_2BYTE_KIND) {
12563 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012564 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012565 ucs2[n] = fill_char;
12566 } else {
12567 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12568 assert(kind == PyUnicode_4BYTE_KIND);
12569 for (n = 0; n < len; ++n)
12570 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012571 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572 }
12573 else {
12574 /* number of characters copied this far */
12575 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012576 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012577 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012578 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012579 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012580 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012581 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012582 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012583 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012584 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585 }
12586
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012587 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012588 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589}
12590
Alexander Belopolsky40018472011-02-26 01:02:56 +000012591PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012592PyUnicode_Replace(PyObject *str,
12593 PyObject *substr,
12594 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012595 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012597 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12598 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012599 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012600 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012601}
12602
INADA Naoki3ae20562017-01-16 20:41:20 +090012603/*[clinic input]
12604str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605
INADA Naoki3ae20562017-01-16 20:41:20 +090012606 old: unicode
12607 new: unicode
12608 count: Py_ssize_t = -1
12609 Maximum number of occurrences to replace.
12610 -1 (the default value) means replace all occurrences.
12611 /
12612
12613Return a copy with all occurrences of substring old replaced by new.
12614
12615If the optional argument count is given, only the first count occurrences are
12616replaced.
12617[clinic start generated code]*/
12618
12619static PyObject *
12620unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12621 Py_ssize_t count)
12622/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012623{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012624 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012625 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012626 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012627}
12628
Alexander Belopolsky40018472011-02-26 01:02:56 +000012629static PyObject *
12630unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012631{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012632 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 Py_ssize_t isize;
12634 Py_ssize_t osize, squote, dquote, i, o;
12635 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012636 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012640 return NULL;
12641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 isize = PyUnicode_GET_LENGTH(unicode);
12643 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 /* Compute length of output, quote characters, and
12646 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012647 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 max = 127;
12649 squote = dquote = 0;
12650 ikind = PyUnicode_KIND(unicode);
12651 for (i = 0; i < isize; i++) {
12652 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012653 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012655 case '\'': squote++; break;
12656 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012658 incr = 2;
12659 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 default:
12661 /* Fast-path ASCII */
12662 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012663 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012665 ;
12666 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012669 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012671 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012672 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012673 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012674 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012675 if (osize > PY_SSIZE_T_MAX - incr) {
12676 PyErr_SetString(PyExc_OverflowError,
12677 "string is too long to generate repr");
12678 return NULL;
12679 }
12680 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012681 }
12682
12683 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012684 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012685 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012686 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012687 if (dquote)
12688 /* Both squote and dquote present. Use squote,
12689 and escape them */
12690 osize += squote;
12691 else
12692 quote = '"';
12693 }
Victor Stinner55c08782013-04-14 18:45:39 +020012694 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012695
12696 repr = PyUnicode_New(osize, max);
12697 if (repr == NULL)
12698 return NULL;
12699 okind = PyUnicode_KIND(repr);
12700 odata = PyUnicode_DATA(repr);
12701
12702 PyUnicode_WRITE(okind, odata, 0, quote);
12703 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012704 if (unchanged) {
12705 _PyUnicode_FastCopyCharacters(repr, 1,
12706 unicode, 0,
12707 isize);
12708 }
12709 else {
12710 for (i = 0, o = 1; i < isize; i++) {
12711 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712
Victor Stinner55c08782013-04-14 18:45:39 +020012713 /* Escape quotes and backslashes */
12714 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012715 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012717 continue;
12718 }
12719
12720 /* Map special whitespace to '\t', \n', '\r' */
12721 if (ch == '\t') {
12722 PyUnicode_WRITE(okind, odata, o++, '\\');
12723 PyUnicode_WRITE(okind, odata, o++, 't');
12724 }
12725 else if (ch == '\n') {
12726 PyUnicode_WRITE(okind, odata, o++, '\\');
12727 PyUnicode_WRITE(okind, odata, o++, 'n');
12728 }
12729 else if (ch == '\r') {
12730 PyUnicode_WRITE(okind, odata, o++, '\\');
12731 PyUnicode_WRITE(okind, odata, o++, 'r');
12732 }
12733
12734 /* Map non-printable US ASCII to '\xhh' */
12735 else if (ch < ' ' || ch == 0x7F) {
12736 PyUnicode_WRITE(okind, odata, o++, '\\');
12737 PyUnicode_WRITE(okind, odata, o++, 'x');
12738 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12739 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12740 }
12741
12742 /* Copy ASCII characters as-is */
12743 else if (ch < 0x7F) {
12744 PyUnicode_WRITE(okind, odata, o++, ch);
12745 }
12746
12747 /* Non-ASCII characters */
12748 else {
12749 /* Map Unicode whitespace and control characters
12750 (categories Z* and C* except ASCII space)
12751 */
12752 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12753 PyUnicode_WRITE(okind, odata, o++, '\\');
12754 /* Map 8-bit characters to '\xhh' */
12755 if (ch <= 0xff) {
12756 PyUnicode_WRITE(okind, odata, o++, 'x');
12757 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12758 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12759 }
12760 /* Map 16-bit characters to '\uxxxx' */
12761 else if (ch <= 0xffff) {
12762 PyUnicode_WRITE(okind, odata, o++, 'u');
12763 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12764 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12765 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12766 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12767 }
12768 /* Map 21-bit characters to '\U00xxxxxx' */
12769 else {
12770 PyUnicode_WRITE(okind, odata, o++, 'U');
12771 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12772 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12773 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12774 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12775 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12776 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12777 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12778 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12779 }
12780 }
12781 /* Copy characters as-is */
12782 else {
12783 PyUnicode_WRITE(okind, odata, o++, ch);
12784 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012785 }
12786 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012787 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012788 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012789 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012790 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791}
12792
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012793PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012794 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012795\n\
12796Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012797such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012798arguments start and end are interpreted as in slice notation.\n\
12799\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012800Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801
12802static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012803unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012804{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012805 /* initialize variables to prevent gcc warning */
12806 PyObject *substring = NULL;
12807 Py_ssize_t start = 0;
12808 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012809 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012811 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012812 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012814 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012815 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012817 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819 if (result == -2)
12820 return NULL;
12821
Christian Heimes217cfd12007-12-02 14:31:20 +000012822 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012823}
12824
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012825PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012826 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012827\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012828Return the highest index in S where substring sub is found,\n\
12829such that sub is contained within S[start:end]. Optional\n\
12830arguments start and end are interpreted as in slice notation.\n\
12831\n\
12832Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833
12834static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012835unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012836{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012837 /* initialize variables to prevent gcc warning */
12838 PyObject *substring = NULL;
12839 Py_ssize_t start = 0;
12840 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012841 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012843 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012844 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012846 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012848
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012849 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012850
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 if (result == -2)
12852 return NULL;
12853
Guido van Rossumd57fd912000-03-10 22:53:23 +000012854 if (result < 0) {
12855 PyErr_SetString(PyExc_ValueError, "substring not found");
12856 return NULL;
12857 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012858
Christian Heimes217cfd12007-12-02 14:31:20 +000012859 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012860}
12861
INADA Naoki3ae20562017-01-16 20:41:20 +090012862/*[clinic input]
12863str.rjust as unicode_rjust
12864
12865 width: Py_ssize_t
12866 fillchar: Py_UCS4 = ' '
12867 /
12868
12869Return a right-justified string of length width.
12870
12871Padding is done using the specified fill character (default is a space).
12872[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873
12874static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012875unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12876/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012878 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879 return NULL;
12880
Victor Stinnerc4b49542011-12-11 22:44:26 +010012881 if (PyUnicode_GET_LENGTH(self) >= width)
12882 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883
Victor Stinnerc4b49542011-12-11 22:44:26 +010012884 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885}
12886
Alexander Belopolsky40018472011-02-26 01:02:56 +000012887PyObject *
12888PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012890 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012891 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012893 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894}
12895
INADA Naoki3ae20562017-01-16 20:41:20 +090012896/*[clinic input]
12897str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012898
INADA Naoki3ae20562017-01-16 20:41:20 +090012899 sep: object = None
        The delimiter according to which to split the string.
12901 None (the default value) means split according to any whitespace,
12902 and discard empty strings from the result.
12903 maxsplit: Py_ssize_t = -1
12904 Maximum number of splits to do.
12905 -1 (the default value) means no limit.
12906
12907Return a list of the words in the string, using sep as the delimiter string.
12908[clinic start generated code]*/
12909
12910static PyObject *
12911unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12912/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012913{
INADA Naoki3ae20562017-01-16 20:41:20 +090012914 if (sep == Py_None)
12915 return split(self, NULL, maxsplit);
12916 if (PyUnicode_Check(sep))
12917 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012918
Victor Stinner998b8062018-09-12 00:23:25 +020012919 PyErr_Format(PyExc_TypeError,
12920 "must be str or None, not %.100s",
12921 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012922 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012923}
12924
Thomas Wouters477c8d52006-05-27 19:21:47 +000012925PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012926PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012927{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012928 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012929 int kind1, kind2;
12930 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012932
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012933 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012934 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012935
Victor Stinner14f8f022011-10-05 20:58:25 +020012936 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012937 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012938 len1 = PyUnicode_GET_LENGTH(str_obj);
12939 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012940 if (kind1 < kind2 || len1 < len2) {
12941 _Py_INCREF_UNICODE_EMPTY();
12942 if (!unicode_empty)
12943 out = NULL;
12944 else {
12945 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12946 Py_DECREF(unicode_empty);
12947 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012948 return out;
12949 }
12950 buf1 = PyUnicode_DATA(str_obj);
12951 buf2 = PyUnicode_DATA(sep_obj);
12952 if (kind2 != kind1) {
12953 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12954 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012955 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012956 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012958 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012959 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012960 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12961 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12962 else
12963 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012964 break;
12965 case PyUnicode_2BYTE_KIND:
12966 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12967 break;
12968 case PyUnicode_4BYTE_KIND:
12969 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12970 break;
12971 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012972 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012974
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012975 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012976 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012977
12978 return out;
12979}
12980
12981
12982PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012983PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012984{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012985 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012986 int kind1, kind2;
12987 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012988 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012989
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012990 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012991 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012992
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012993 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012994 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012995 len1 = PyUnicode_GET_LENGTH(str_obj);
12996 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012997 if (kind1 < kind2 || len1 < len2) {
12998 _Py_INCREF_UNICODE_EMPTY();
12999 if (!unicode_empty)
13000 out = NULL;
13001 else {
13002 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13003 Py_DECREF(unicode_empty);
13004 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013005 return out;
13006 }
13007 buf1 = PyUnicode_DATA(str_obj);
13008 buf2 = PyUnicode_DATA(sep_obj);
13009 if (kind2 != kind1) {
13010 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13011 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013012 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013015 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013016 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013017 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13018 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13019 else
13020 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013021 break;
13022 case PyUnicode_2BYTE_KIND:
13023 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13024 break;
13025 case PyUnicode_4BYTE_KIND:
13026 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13027 break;
13028 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013029 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013031
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013032 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013034
13035 return out;
13036}
13037
INADA Naoki3ae20562017-01-16 20:41:20 +090013038/*[clinic input]
13039str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013040
INADA Naoki3ae20562017-01-16 20:41:20 +090013041 sep: object
13042 /
13043
13044Partition the string into three parts using the given separator.
13045
13046This will search for the separator in the string. If the separator is found,
13047returns a 3-tuple containing the part before the separator, the separator
13048itself, and the part after it.
13049
13050If the separator is not found, returns a 3-tuple containing the original string
13051and two empty strings.
13052[clinic start generated code]*/
13053
13054static PyObject *
13055unicode_partition(PyObject *self, PyObject *sep)
13056/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013057{
INADA Naoki3ae20562017-01-16 20:41:20 +090013058 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013059}
13060
INADA Naoki3ae20562017-01-16 20:41:20 +090013061/*[clinic input]
13062str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013063
INADA Naoki3ae20562017-01-16 20:41:20 +090013064Partition the string into three parts using the given separator.
13065
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013066This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013067the separator is found, returns a 3-tuple containing the part before the
13068separator, the separator itself, and the part after it.
13069
13070If the separator is not found, returns a 3-tuple containing two empty strings
13071and the original string.
13072[clinic start generated code]*/
13073
13074static PyObject *
13075unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013076/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013077{
INADA Naoki3ae20562017-01-16 20:41:20 +090013078 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013079}
13080
Alexander Belopolsky40018472011-02-26 01:02:56 +000013081PyObject *
13082PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013083{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013084 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013085 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013086
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013087 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013088}
13089
INADA Naoki3ae20562017-01-16 20:41:20 +090013090/*[clinic input]
13091str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013092
INADA Naoki3ae20562017-01-16 20:41:20 +090013093Return a list of the words in the string, using sep as the delimiter string.
13094
13095Splits are done starting at the end of the string and working to the front.
13096[clinic start generated code]*/
13097
13098static PyObject *
13099unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13100/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013101{
INADA Naoki3ae20562017-01-16 20:41:20 +090013102 if (sep == Py_None)
13103 return rsplit(self, NULL, maxsplit);
13104 if (PyUnicode_Check(sep))
13105 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013106
Victor Stinner998b8062018-09-12 00:23:25 +020013107 PyErr_Format(PyExc_TypeError,
13108 "must be str or None, not %.100s",
13109 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013110 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013111}
13112
INADA Naoki3ae20562017-01-16 20:41:20 +090013113/*[clinic input]
13114str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013115
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013116 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013117
13118Return a list of the lines in the string, breaking at line boundaries.
13119
13120Line breaks are not included in the resulting list unless keepends is given and
13121true.
13122[clinic start generated code]*/
13123
13124static PyObject *
13125unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013126/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013127{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013128 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129}
13130
13131static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013132PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013134 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135}
13136
INADA Naoki3ae20562017-01-16 20:41:20 +090013137/*[clinic input]
13138str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139
INADA Naoki3ae20562017-01-16 20:41:20 +090013140Convert uppercase characters to lowercase and lowercase characters to uppercase.
13141[clinic start generated code]*/
13142
13143static PyObject *
13144unicode_swapcase_impl(PyObject *self)
13145/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013146{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013147 if (PyUnicode_READY(self) == -1)
13148 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013149 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013150}
13151
Larry Hastings61272b72014-01-07 12:41:53 -080013152/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013153
Larry Hastings31826802013-10-19 00:09:25 -070013154@staticmethod
13155str.maketrans as unicode_maketrans
13156
13157 x: object
13158
13159 y: unicode=NULL
13160
13161 z: unicode=NULL
13162
13163 /
13164
13165Return a translation table usable for str.translate().
13166
13167If there is only one argument, it must be a dictionary mapping Unicode
13168ordinals (integers) or characters to Unicode ordinals, strings or None.
13169Character keys will be then converted to ordinals.
13170If there are two arguments, they must be strings of equal length, and
13171in the resulting dictionary, each character in x will be mapped to the
13172character at the same position in y. If there is a third argument, it
13173must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013174[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013175
Larry Hastings31826802013-10-19 00:09:25 -070013176static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013177unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013178/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013179{
Georg Brandlceee0772007-11-27 23:48:05 +000013180 PyObject *new = NULL, *key, *value;
13181 Py_ssize_t i = 0;
13182 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013183
Georg Brandlceee0772007-11-27 23:48:05 +000013184 new = PyDict_New();
13185 if (!new)
13186 return NULL;
13187 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013188 int x_kind, y_kind, z_kind;
13189 void *x_data, *y_data, *z_data;
13190
Georg Brandlceee0772007-11-27 23:48:05 +000013191 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013192 if (!PyUnicode_Check(x)) {
13193 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13194 "be a string if there is a second argument");
13195 goto err;
13196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013197 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013198 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13199 "arguments must have equal length");
13200 goto err;
13201 }
13202 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013203 x_kind = PyUnicode_KIND(x);
13204 y_kind = PyUnicode_KIND(y);
13205 x_data = PyUnicode_DATA(x);
13206 y_data = PyUnicode_DATA(y);
13207 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13208 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013209 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013210 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013211 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013212 if (!value) {
13213 Py_DECREF(key);
13214 goto err;
13215 }
Georg Brandlceee0772007-11-27 23:48:05 +000013216 res = PyDict_SetItem(new, key, value);
13217 Py_DECREF(key);
13218 Py_DECREF(value);
13219 if (res < 0)
13220 goto err;
13221 }
13222 /* create entries for deleting chars in z */
13223 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013224 z_kind = PyUnicode_KIND(z);
13225 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013226 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013227 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013228 if (!key)
13229 goto err;
13230 res = PyDict_SetItem(new, key, Py_None);
13231 Py_DECREF(key);
13232 if (res < 0)
13233 goto err;
13234 }
13235 }
13236 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013237 int kind;
13238 void *data;
13239
Georg Brandlceee0772007-11-27 23:48:05 +000013240 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013241 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013242 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13243 "to maketrans it must be a dict");
13244 goto err;
13245 }
13246 /* copy entries into the new dict, converting string keys to int keys */
13247 while (PyDict_Next(x, &i, &key, &value)) {
13248 if (PyUnicode_Check(key)) {
13249 /* convert string keys to integer keys */
13250 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013251 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013252 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13253 "table must be of length 1");
13254 goto err;
13255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013256 kind = PyUnicode_KIND(key);
13257 data = PyUnicode_DATA(key);
13258 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013259 if (!newkey)
13260 goto err;
13261 res = PyDict_SetItem(new, newkey, value);
13262 Py_DECREF(newkey);
13263 if (res < 0)
13264 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013265 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013266 /* just keep integer keys */
13267 if (PyDict_SetItem(new, key, value) < 0)
13268 goto err;
13269 } else {
13270 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13271 "be strings or integers");
13272 goto err;
13273 }
13274 }
13275 }
13276 return new;
13277 err:
13278 Py_DECREF(new);
13279 return NULL;
13280}
13281
INADA Naoki3ae20562017-01-16 20:41:20 +090013282/*[clinic input]
13283str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284
INADA Naoki3ae20562017-01-16 20:41:20 +090013285 table: object
13286 Translation table, which must be a mapping of Unicode ordinals to
13287 Unicode ordinals, strings, or None.
13288 /
13289
13290Replace each character in the string using the given translation table.
13291
13292The table must implement lookup/indexing via __getitem__, for instance a
13293dictionary or list. If this operation raises LookupError, the character is
13294left untouched. Characters mapped to None are deleted.
13295[clinic start generated code]*/
13296
13297static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013298unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013299/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013301 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013302}
13303
INADA Naoki3ae20562017-01-16 20:41:20 +090013304/*[clinic input]
13305str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013306
INADA Naoki3ae20562017-01-16 20:41:20 +090013307Return a copy of the string converted to uppercase.
13308[clinic start generated code]*/
13309
13310static PyObject *
13311unicode_upper_impl(PyObject *self)
13312/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013313{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013314 if (PyUnicode_READY(self) == -1)
13315 return NULL;
13316 if (PyUnicode_IS_ASCII(self))
13317 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013318 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013319}
13320
INADA Naoki3ae20562017-01-16 20:41:20 +090013321/*[clinic input]
13322str.zfill as unicode_zfill
13323
13324 width: Py_ssize_t
13325 /
13326
13327Pad a numeric string with zeros on the left, to fill a field of the given width.
13328
13329The string is never truncated.
13330[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013331
13332static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013333unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013334/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013335{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013336 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013337 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013338 int kind;
13339 void *data;
13340 Py_UCS4 chr;
13341
Benjamin Petersonbac79492012-01-14 13:34:47 -050013342 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013343 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013344
Victor Stinnerc4b49542011-12-11 22:44:26 +010013345 if (PyUnicode_GET_LENGTH(self) >= width)
13346 return unicode_result_unchanged(self);
13347
13348 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013349
13350 u = pad(self, fill, 0, '0');
13351
Walter Dörwald068325e2002-04-15 13:36:47 +000013352 if (u == NULL)
13353 return NULL;
13354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013355 kind = PyUnicode_KIND(u);
13356 data = PyUnicode_DATA(u);
13357 chr = PyUnicode_READ(kind, data, fill);
13358
13359 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013360 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013361 PyUnicode_WRITE(kind, data, 0, chr);
13362 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013363 }
13364
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013365 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013366 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013368
13369#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013370static PyObject *
13371unicode__decimal2ascii(PyObject *self)
13372{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013373 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013374}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013375#endif
13376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013377PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013378 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013379\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013380Return True if S starts with the specified prefix, False otherwise.\n\
13381With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013382With optional end, stop comparing S at that position.\n\
13383prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013384
13385static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013386unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013387 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013388{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013389 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013390 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013391 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013392 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013393 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013394
Jesus Ceaac451502011-04-20 17:09:23 +020013395 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013396 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013397 if (PyTuple_Check(subobj)) {
13398 Py_ssize_t i;
13399 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013400 substring = PyTuple_GET_ITEM(subobj, i);
13401 if (!PyUnicode_Check(substring)) {
13402 PyErr_Format(PyExc_TypeError,
13403 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013404 "not %.100s",
13405 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013406 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013407 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013408 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013409 if (result == -1)
13410 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013411 if (result) {
13412 Py_RETURN_TRUE;
13413 }
13414 }
13415 /* nothing matched */
13416 Py_RETURN_FALSE;
13417 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013418 if (!PyUnicode_Check(subobj)) {
13419 PyErr_Format(PyExc_TypeError,
13420 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013421 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013422 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013423 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013424 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013425 if (result == -1)
13426 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013427 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013428}
13429
13430
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013431PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013432 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013433\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013434Return True if S ends with the specified suffix, False otherwise.\n\
13435With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013436With optional end, stop comparing S at that position.\n\
13437suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438
13439static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013440unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013441 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013442{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013443 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013444 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013445 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013446 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013447 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013448
Jesus Ceaac451502011-04-20 17:09:23 +020013449 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013450 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013451 if (PyTuple_Check(subobj)) {
13452 Py_ssize_t i;
13453 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013454 substring = PyTuple_GET_ITEM(subobj, i);
13455 if (!PyUnicode_Check(substring)) {
13456 PyErr_Format(PyExc_TypeError,
13457 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013458 "not %.100s",
13459 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013460 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013461 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013462 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013463 if (result == -1)
13464 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013465 if (result) {
13466 Py_RETURN_TRUE;
13467 }
13468 }
13469 Py_RETURN_FALSE;
13470 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013471 if (!PyUnicode_Check(subobj)) {
13472 PyErr_Format(PyExc_TypeError,
13473 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013474 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013475 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013476 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013477 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013478 if (result == -1)
13479 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013480 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013481}
13482
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013483static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013484_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013485{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013486 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13487 writer->data = PyUnicode_DATA(writer->buffer);
13488
13489 if (!writer->readonly) {
13490 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013491 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013492 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013493 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013494 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13495 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13496 writer->kind = PyUnicode_WCHAR_KIND;
13497 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13498
Victor Stinner8f674cc2013-04-17 23:02:17 +020013499 /* Copy-on-write mode: set buffer size to 0 so
13500 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13501 * next write. */
13502 writer->size = 0;
13503 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013504}
13505
Victor Stinnerd3f08822012-05-29 12:57:52 +020013506void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013507_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013508{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013509 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013510
13511 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013512 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013513
13514 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13515 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13516 writer->kind = PyUnicode_WCHAR_KIND;
13517 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013518}
13519
Inada Naoki770847a2019-06-24 12:30:24 +090013520// Initialize _PyUnicodeWriter with initial buffer
13521static inline void
13522_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13523{
13524 memset(writer, 0, sizeof(*writer));
13525 writer->buffer = buffer;
13526 _PyUnicodeWriter_Update(writer);
13527 writer->min_length = writer->size;
13528}
13529
Victor Stinnerd3f08822012-05-29 12:57:52 +020013530int
13531_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13532 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013533{
13534 Py_ssize_t newlen;
13535 PyObject *newbuffer;
13536
Victor Stinner2740e462016-09-06 16:58:36 -070013537 assert(maxchar <= MAX_UNICODE);
13538
Victor Stinnerca9381e2015-09-22 00:58:32 +020013539 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013540 assert((maxchar > writer->maxchar && length >= 0)
13541 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013542
Victor Stinner202fdca2012-05-07 12:47:02 +020013543 if (length > PY_SSIZE_T_MAX - writer->pos) {
13544 PyErr_NoMemory();
13545 return -1;
13546 }
13547 newlen = writer->pos + length;
13548
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013549 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013550
Victor Stinnerd3f08822012-05-29 12:57:52 +020013551 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013552 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013553 if (writer->overallocate
13554 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13555 /* overallocate to limit the number of realloc() */
13556 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013557 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013558 if (newlen < writer->min_length)
13559 newlen = writer->min_length;
13560
Victor Stinnerd3f08822012-05-29 12:57:52 +020013561 writer->buffer = PyUnicode_New(newlen, maxchar);
13562 if (writer->buffer == NULL)
13563 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013564 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013565 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013566 if (writer->overallocate
13567 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13568 /* overallocate to limit the number of realloc() */
13569 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013570 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013571 if (newlen < writer->min_length)
13572 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013573
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013574 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013575 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013576 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013577 newbuffer = PyUnicode_New(newlen, maxchar);
13578 if (newbuffer == NULL)
13579 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013580 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13581 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013582 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013583 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013584 }
13585 else {
13586 newbuffer = resize_compact(writer->buffer, newlen);
13587 if (newbuffer == NULL)
13588 return -1;
13589 }
13590 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013591 }
13592 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013593 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013594 newbuffer = PyUnicode_New(writer->size, maxchar);
13595 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013596 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013597 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13598 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013599 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013600 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013601 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013602 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013603
13604#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013605}
13606
Victor Stinnerca9381e2015-09-22 00:58:32 +020013607int
13608_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13609 enum PyUnicode_Kind kind)
13610{
13611 Py_UCS4 maxchar;
13612
13613 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13614 assert(writer->kind < kind);
13615
13616 switch (kind)
13617 {
13618 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13619 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13620 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13621 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013622 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013623 }
13624
13625 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13626}
13627
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013628static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013629_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013630{
Victor Stinner2740e462016-09-06 16:58:36 -070013631 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013632 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13633 return -1;
13634 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13635 writer->pos++;
13636 return 0;
13637}
13638
13639int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013640_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13641{
13642 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13643}
13644
13645int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013646_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13647{
13648 Py_UCS4 maxchar;
13649 Py_ssize_t len;
13650
13651 if (PyUnicode_READY(str) == -1)
13652 return -1;
13653 len = PyUnicode_GET_LENGTH(str);
13654 if (len == 0)
13655 return 0;
13656 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13657 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013658 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013659 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013660 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013661 Py_INCREF(str);
13662 writer->buffer = str;
13663 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013664 writer->pos += len;
13665 return 0;
13666 }
13667 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13668 return -1;
13669 }
13670 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13671 str, 0, len);
13672 writer->pos += len;
13673 return 0;
13674}
13675
Victor Stinnere215d962012-10-06 23:03:36 +020013676int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013677_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13678 Py_ssize_t start, Py_ssize_t end)
13679{
13680 Py_UCS4 maxchar;
13681 Py_ssize_t len;
13682
13683 if (PyUnicode_READY(str) == -1)
13684 return -1;
13685
13686 assert(0 <= start);
13687 assert(end <= PyUnicode_GET_LENGTH(str));
13688 assert(start <= end);
13689
13690 if (end == 0)
13691 return 0;
13692
13693 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13694 return _PyUnicodeWriter_WriteStr(writer, str);
13695
13696 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13697 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13698 else
13699 maxchar = writer->maxchar;
13700 len = end - start;
13701
13702 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13703 return -1;
13704
13705 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13706 str, start, len);
13707 writer->pos += len;
13708 return 0;
13709}
13710
13711int
Victor Stinner4a587072013-11-19 12:54:53 +010013712_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13713 const char *ascii, Py_ssize_t len)
13714{
13715 if (len == -1)
13716 len = strlen(ascii);
13717
13718 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13719
13720 if (writer->buffer == NULL && !writer->overallocate) {
13721 PyObject *str;
13722
13723 str = _PyUnicode_FromASCII(ascii, len);
13724 if (str == NULL)
13725 return -1;
13726
13727 writer->readonly = 1;
13728 writer->buffer = str;
13729 _PyUnicodeWriter_Update(writer);
13730 writer->pos += len;
13731 return 0;
13732 }
13733
13734 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13735 return -1;
13736
13737 switch (writer->kind)
13738 {
13739 case PyUnicode_1BYTE_KIND:
13740 {
13741 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13742 Py_UCS1 *data = writer->data;
13743
Christian Heimesf051e432016-09-13 20:22:02 +020013744 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013745 break;
13746 }
13747 case PyUnicode_2BYTE_KIND:
13748 {
13749 _PyUnicode_CONVERT_BYTES(
13750 Py_UCS1, Py_UCS2,
13751 ascii, ascii + len,
13752 (Py_UCS2 *)writer->data + writer->pos);
13753 break;
13754 }
13755 case PyUnicode_4BYTE_KIND:
13756 {
13757 _PyUnicode_CONVERT_BYTES(
13758 Py_UCS1, Py_UCS4,
13759 ascii, ascii + len,
13760 (Py_UCS4 *)writer->data + writer->pos);
13761 break;
13762 }
13763 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013764 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013765 }
13766
13767 writer->pos += len;
13768 return 0;
13769}
13770
13771int
13772_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13773 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013774{
13775 Py_UCS4 maxchar;
13776
13777 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13778 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13779 return -1;
13780 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13781 writer->pos += len;
13782 return 0;
13783}
13784
Victor Stinnerd3f08822012-05-29 12:57:52 +020013785PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013786_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013787{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013788 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013789
Victor Stinnerd3f08822012-05-29 12:57:52 +020013790 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013791 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013792 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013793 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013794
13795 str = writer->buffer;
13796 writer->buffer = NULL;
13797
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013798 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013799 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13800 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013801 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013802
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013803 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13804 PyObject *str2;
13805 str2 = resize_compact(str, writer->pos);
13806 if (str2 == NULL) {
13807 Py_DECREF(str);
13808 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013809 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013810 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013811 }
13812
Victor Stinner15a0bd32013-07-08 22:29:55 +020013813 assert(_PyUnicode_CheckConsistency(str, 1));
13814 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013815}
13816
Victor Stinnerd3f08822012-05-29 12:57:52 +020013817void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013818_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013819{
13820 Py_CLEAR(writer->buffer);
13821}
13822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013823#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013824
13825PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013826 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013827\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013828Return a formatted version of S, using substitutions from args and kwargs.\n\
13829The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013830
Eric Smith27bbca62010-11-04 17:06:58 +000013831PyDoc_STRVAR(format_map__doc__,
13832 "S.format_map(mapping) -> str\n\
13833\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013834Return a formatted version of S, using substitutions from mapping.\n\
13835The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013836
INADA Naoki3ae20562017-01-16 20:41:20 +090013837/*[clinic input]
13838str.__format__ as unicode___format__
13839
13840 format_spec: unicode
13841 /
13842
13843Return a formatted version of the string as described by format_spec.
13844[clinic start generated code]*/
13845
Eric Smith4a7d76d2008-05-30 18:10:19 +000013846static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013847unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013848/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013849{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013850 _PyUnicodeWriter writer;
13851 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013852
Victor Stinnerd3f08822012-05-29 12:57:52 +020013853 if (PyUnicode_READY(self) == -1)
13854 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013855 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013856 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13857 self, format_spec, 0,
13858 PyUnicode_GET_LENGTH(format_spec));
13859 if (ret == -1) {
13860 _PyUnicodeWriter_Dealloc(&writer);
13861 return NULL;
13862 }
13863 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013864}
13865
INADA Naoki3ae20562017-01-16 20:41:20 +090013866/*[clinic input]
13867str.__sizeof__ as unicode_sizeof
13868
13869Return the size of the string in memory, in bytes.
13870[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013871
13872static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013873unicode_sizeof_impl(PyObject *self)
13874/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013875{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013876 Py_ssize_t size;
13877
13878 /* If it's a compact object, account for base structure +
13879 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013880 if (PyUnicode_IS_COMPACT_ASCII(self))
13881 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13882 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013883 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013884 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013885 else {
13886 /* If it is a two-block object, account for base object, and
13887 for character block if present. */
13888 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013889 if (_PyUnicode_DATA_ANY(self))
13890 size += (PyUnicode_GET_LENGTH(self) + 1) *
13891 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013892 }
13893 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013894 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013895 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13896 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13897 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13898 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013899
13900 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013901}
13902
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013903static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013904unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013905{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013906 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013907 if (!copy)
13908 return NULL;
13909 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013910}
13911
Guido van Rossumd57fd912000-03-10 22:53:23 +000013912static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013913 UNICODE_ENCODE_METHODDEF
13914 UNICODE_REPLACE_METHODDEF
13915 UNICODE_SPLIT_METHODDEF
13916 UNICODE_RSPLIT_METHODDEF
13917 UNICODE_JOIN_METHODDEF
13918 UNICODE_CAPITALIZE_METHODDEF
13919 UNICODE_CASEFOLD_METHODDEF
13920 UNICODE_TITLE_METHODDEF
13921 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013922 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013923 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013924 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013925 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013926 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013927 UNICODE_LJUST_METHODDEF
13928 UNICODE_LOWER_METHODDEF
13929 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013930 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13931 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013932 UNICODE_RJUST_METHODDEF
13933 UNICODE_RSTRIP_METHODDEF
13934 UNICODE_RPARTITION_METHODDEF
13935 UNICODE_SPLITLINES_METHODDEF
13936 UNICODE_STRIP_METHODDEF
13937 UNICODE_SWAPCASE_METHODDEF
13938 UNICODE_TRANSLATE_METHODDEF
13939 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013940 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13941 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013942 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013943 UNICODE_ISLOWER_METHODDEF
13944 UNICODE_ISUPPER_METHODDEF
13945 UNICODE_ISTITLE_METHODDEF
13946 UNICODE_ISSPACE_METHODDEF
13947 UNICODE_ISDECIMAL_METHODDEF
13948 UNICODE_ISDIGIT_METHODDEF
13949 UNICODE_ISNUMERIC_METHODDEF
13950 UNICODE_ISALPHA_METHODDEF
13951 UNICODE_ISALNUM_METHODDEF
13952 UNICODE_ISIDENTIFIER_METHODDEF
13953 UNICODE_ISPRINTABLE_METHODDEF
13954 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020013955 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013956 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013957 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013958 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013959 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013960#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013961 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013962 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013963#endif
13964
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013965 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013966 {NULL, NULL}
13967};
13968
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013969static PyObject *
13970unicode_mod(PyObject *v, PyObject *w)
13971{
Brian Curtindfc80e32011-08-10 20:28:54 -050013972 if (!PyUnicode_Check(v))
13973 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013974 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013975}
13976
13977static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013978 0, /*nb_add*/
13979 0, /*nb_subtract*/
13980 0, /*nb_multiply*/
13981 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013982};
13983
Guido van Rossumd57fd912000-03-10 22:53:23 +000013984static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013985 (lenfunc) unicode_length, /* sq_length */
13986 PyUnicode_Concat, /* sq_concat */
13987 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13988 (ssizeargfunc) unicode_getitem, /* sq_item */
13989 0, /* sq_slice */
13990 0, /* sq_ass_item */
13991 0, /* sq_ass_slice */
13992 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013993};
13994
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013995static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013996unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013998 if (PyUnicode_READY(self) == -1)
13999 return NULL;
14000
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014001 if (PyIndex_Check(item)) {
14002 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014003 if (i == -1 && PyErr_Occurred())
14004 return NULL;
14005 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014006 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014007 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014008 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014009 Py_ssize_t start, stop, step, slicelength, i;
14010 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014011 PyObject *result;
14012 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014013 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014014 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014015
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014016 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014017 return NULL;
14018 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014019 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14020 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014021
14022 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014023 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014024 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014025 slicelength == PyUnicode_GET_LENGTH(self)) {
14026 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014027 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014028 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014029 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014030 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014031 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014032 src_kind = PyUnicode_KIND(self);
14033 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014034 if (!PyUnicode_IS_ASCII(self)) {
14035 kind_limit = kind_maxchar_limit(src_kind);
14036 max_char = 0;
14037 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14038 ch = PyUnicode_READ(src_kind, src_data, cur);
14039 if (ch > max_char) {
14040 max_char = ch;
14041 if (max_char >= kind_limit)
14042 break;
14043 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014044 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014045 }
Victor Stinner55c99112011-10-13 01:17:06 +020014046 else
14047 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014048 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014049 if (result == NULL)
14050 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014051 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014052 dest_data = PyUnicode_DATA(result);
14053
14054 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014055 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14056 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014057 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014058 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014059 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014060 } else {
14061 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14062 return NULL;
14063 }
14064}
14065
14066static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014067 (lenfunc)unicode_length, /* mp_length */
14068 (binaryfunc)unicode_subscript, /* mp_subscript */
14069 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014070};
14071
Guido van Rossumd57fd912000-03-10 22:53:23 +000014072
Guido van Rossumd57fd912000-03-10 22:53:23 +000014073/* Helpers for PyUnicode_Format() */
14074
Victor Stinnera47082312012-10-04 02:19:54 +020014075struct unicode_formatter_t {
14076 PyObject *args;
14077 int args_owned;
14078 Py_ssize_t arglen, argidx;
14079 PyObject *dict;
14080
14081 enum PyUnicode_Kind fmtkind;
14082 Py_ssize_t fmtcnt, fmtpos;
14083 void *fmtdata;
14084 PyObject *fmtstr;
14085
14086 _PyUnicodeWriter writer;
14087};
14088
14089struct unicode_format_arg_t {
14090 Py_UCS4 ch;
14091 int flags;
14092 Py_ssize_t width;
14093 int prec;
14094 int sign;
14095};
14096
Guido van Rossumd57fd912000-03-10 22:53:23 +000014097static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014098unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014099{
Victor Stinnera47082312012-10-04 02:19:54 +020014100 Py_ssize_t argidx = ctx->argidx;
14101
14102 if (argidx < ctx->arglen) {
14103 ctx->argidx++;
14104 if (ctx->arglen < 0)
14105 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014106 else
Victor Stinnera47082312012-10-04 02:19:54 +020014107 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014108 }
14109 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014110 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014111 return NULL;
14112}
14113
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014114/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014115
Victor Stinnera47082312012-10-04 02:19:54 +020014116/* Format a float into the writer if the writer is not NULL, or into *p_output
14117 otherwise.
14118
14119 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014120static int
Victor Stinnera47082312012-10-04 02:19:54 +020014121formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14122 PyObject **p_output,
14123 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014124{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014125 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014126 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014127 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014128 int prec;
14129 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014130
Guido van Rossumd57fd912000-03-10 22:53:23 +000014131 x = PyFloat_AsDouble(v);
14132 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014133 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014134
Victor Stinnera47082312012-10-04 02:19:54 +020014135 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014136 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014137 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014138
Victor Stinnera47082312012-10-04 02:19:54 +020014139 if (arg->flags & F_ALT)
14140 dtoa_flags = Py_DTSF_ALT;
14141 else
14142 dtoa_flags = 0;
14143 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014144 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014145 return -1;
14146 len = strlen(p);
14147 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014148 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014149 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014150 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014151 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014152 }
14153 else
14154 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014155 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014156 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014157}
14158
Victor Stinnerd0880d52012-04-27 23:40:13 +020014159/* formatlong() emulates the format codes d, u, o, x and X, and
14160 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14161 * Python's regular ints.
14162 * Return value: a new PyUnicodeObject*, or NULL if error.
14163 * The output string is of the form
14164 * "-"? ("0x" | "0X")? digit+
14165 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14166 * set in flags. The case of hex digits will be correct,
14167 * There will be at least prec digits, zero-filled on the left if
14168 * necessary to get that many.
14169 * val object to be converted
14170 * flags bitmask of format flags; only F_ALT is looked at
14171 * prec minimum number of digits; 0-fill on left if needed
14172 * type a character in [duoxX]; u acts the same as d
14173 *
14174 * CAUTION: o, x and X conversions on regular ints can never
14175 * produce a '-' sign, but can for Python's unbounded ints.
14176 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014177PyObject *
14178_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014179{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014180 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014181 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014182 Py_ssize_t i;
14183 int sign; /* 1 if '-', else 0 */
14184 int len; /* number of characters */
14185 Py_ssize_t llen;
14186 int numdigits; /* len == numnondigits + numdigits */
14187 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014188
Victor Stinnerd0880d52012-04-27 23:40:13 +020014189 /* Avoid exceeding SSIZE_T_MAX */
14190 if (prec > INT_MAX-3) {
14191 PyErr_SetString(PyExc_OverflowError,
14192 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014193 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014194 }
14195
14196 assert(PyLong_Check(val));
14197
14198 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014199 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014200 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014201 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014202 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014203 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014204 /* int and int subclasses should print numerically when a numeric */
14205 /* format code is used (see issue18780) */
14206 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014207 break;
14208 case 'o':
14209 numnondigits = 2;
14210 result = PyNumber_ToBase(val, 8);
14211 break;
14212 case 'x':
14213 case 'X':
14214 numnondigits = 2;
14215 result = PyNumber_ToBase(val, 16);
14216 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014217 }
14218 if (!result)
14219 return NULL;
14220
14221 assert(unicode_modifiable(result));
14222 assert(PyUnicode_IS_READY(result));
14223 assert(PyUnicode_IS_ASCII(result));
14224
14225 /* To modify the string in-place, there can only be one reference. */
14226 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014227 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014228 PyErr_BadInternalCall();
14229 return NULL;
14230 }
14231 buf = PyUnicode_DATA(result);
14232 llen = PyUnicode_GET_LENGTH(result);
14233 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014234 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014235 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014236 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014237 return NULL;
14238 }
14239 len = (int)llen;
14240 sign = buf[0] == '-';
14241 numnondigits += sign;
14242 numdigits = len - numnondigits;
14243 assert(numdigits > 0);
14244
14245 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014246 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014247 (type == 'o' || type == 'x' || type == 'X'))) {
14248 assert(buf[sign] == '0');
14249 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14250 buf[sign+1] == 'o');
14251 numnondigits -= 2;
14252 buf += 2;
14253 len -= 2;
14254 if (sign)
14255 buf[0] = '-';
14256 assert(len == numnondigits + numdigits);
14257 assert(numdigits > 0);
14258 }
14259
14260 /* Fill with leading zeroes to meet minimum width. */
14261 if (prec > numdigits) {
14262 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14263 numnondigits + prec);
14264 char *b1;
14265 if (!r1) {
14266 Py_DECREF(result);
14267 return NULL;
14268 }
14269 b1 = PyBytes_AS_STRING(r1);
14270 for (i = 0; i < numnondigits; ++i)
14271 *b1++ = *buf++;
14272 for (i = 0; i < prec - numdigits; i++)
14273 *b1++ = '0';
14274 for (i = 0; i < numdigits; i++)
14275 *b1++ = *buf++;
14276 *b1 = '\0';
14277 Py_DECREF(result);
14278 result = r1;
14279 buf = PyBytes_AS_STRING(result);
14280 len = numnondigits + prec;
14281 }
14282
14283 /* Fix up case for hex conversions. */
14284 if (type == 'X') {
14285 /* Need to convert all lower case letters to upper case.
14286 and need to convert 0x to 0X (and -0x to -0X). */
14287 for (i = 0; i < len; i++)
14288 if (buf[i] >= 'a' && buf[i] <= 'x')
14289 buf[i] -= 'a'-'A';
14290 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014291 if (!PyUnicode_Check(result)
14292 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014293 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014294 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014295 Py_DECREF(result);
14296 result = unicode;
14297 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014298 else if (len != PyUnicode_GET_LENGTH(result)) {
14299 if (PyUnicode_Resize(&result, len) < 0)
14300 Py_CLEAR(result);
14301 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014302 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014303}
14304
Ethan Furmandf3ed242014-01-05 06:50:30 -080014305/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014306 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014307 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014308 * -1 and raise an exception on error */
14309static int
Victor Stinnera47082312012-10-04 02:19:54 +020014310mainformatlong(PyObject *v,
14311 struct unicode_format_arg_t *arg,
14312 PyObject **p_output,
14313 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014314{
14315 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014316 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014317
14318 if (!PyNumber_Check(v))
14319 goto wrongtype;
14320
Ethan Furman9ab74802014-03-21 06:38:46 -070014321 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014322 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014323 if (type == 'o' || type == 'x' || type == 'X') {
14324 iobj = PyNumber_Index(v);
14325 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014326 if (PyErr_ExceptionMatches(PyExc_TypeError))
14327 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014328 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014329 }
14330 }
14331 else {
14332 iobj = PyNumber_Long(v);
14333 if (iobj == NULL ) {
14334 if (PyErr_ExceptionMatches(PyExc_TypeError))
14335 goto wrongtype;
14336 return -1;
14337 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014338 }
14339 assert(PyLong_Check(iobj));
14340 }
14341 else {
14342 iobj = v;
14343 Py_INCREF(iobj);
14344 }
14345
14346 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014347 && arg->width == -1 && arg->prec == -1
14348 && !(arg->flags & (F_SIGN | F_BLANK))
14349 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014350 {
14351 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014352 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014353 int base;
14354
Victor Stinnera47082312012-10-04 02:19:54 +020014355 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014356 {
14357 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014358 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014359 case 'd':
14360 case 'i':
14361 case 'u':
14362 base = 10;
14363 break;
14364 case 'o':
14365 base = 8;
14366 break;
14367 case 'x':
14368 case 'X':
14369 base = 16;
14370 break;
14371 }
14372
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014373 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14374 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014375 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014376 }
14377 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014378 return 1;
14379 }
14380
Ethan Furmanb95b5612015-01-23 20:05:18 -080014381 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014382 Py_DECREF(iobj);
14383 if (res == NULL)
14384 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014385 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014386 return 0;
14387
14388wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014389 switch(type)
14390 {
14391 case 'o':
14392 case 'x':
14393 case 'X':
14394 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014395 "%%%c format: an integer is required, "
14396 "not %.200s",
14397 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014398 break;
14399 default:
14400 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014401 "%%%c format: a number is required, "
14402 "not %.200s",
14403 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014404 break;
14405 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014406 return -1;
14407}
14408
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014409static Py_UCS4
14410formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014411{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014412 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014413 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014414 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014415 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014416 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014417 goto onError;
14418 }
14419 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014420 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014421 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014422 /* make sure number is a type of integer */
14423 if (!PyLong_Check(v)) {
14424 iobj = PyNumber_Index(v);
14425 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014426 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014427 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014428 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014429 Py_DECREF(iobj);
14430 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014431 else {
14432 x = PyLong_AsLong(v);
14433 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014434 if (x == -1 && PyErr_Occurred())
14435 goto onError;
14436
Victor Stinner8faf8212011-12-08 22:14:11 +010014437 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014438 PyErr_SetString(PyExc_OverflowError,
14439 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014440 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014441 }
14442
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014443 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014444 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014445
Benjamin Peterson29060642009-01-31 22:14:21 +000014446 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014447 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014448 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014449 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014450}
14451
/* Parse options of an argument: flags, width, precision.
   Handle also "%(name)" syntax.

   On entry arg->ch is the character following the '%' and ctx->fmtpos is
   its index.  The parsed options are stored into arg->flags, arg->width
   and arg->prec; on success arg->ch holds the last character read, i.e.
   the conversion character, and ctx->fmtpos points just past it.

   For "%(name)" the value looked up in ctx->dict replaces ctx->args (an
   owned reference, flagged by ctx->args_owned).

   Return 0 on success.
   Raise an exception and return -1 on error.  This function only parses:
   it never writes into ctx->writer and never returns 1. */
static int
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
                         struct unicode_format_arg_t *arg)
{
/* Read the format character at the current position (does not advance). */
#define FORMAT_READ(ctx) \
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)

    PyObject *v;

    if (arg->ch == '(') {
        /* Get argument value from a dictionary. Example: "%(name)s". */
        Py_ssize_t keystart;
        Py_ssize_t keylen;
        PyObject *key;
        int pcount = 1;

        if (ctx->dict == NULL) {
            PyErr_SetString(PyExc_TypeError,
                            "format requires a mapping");
            return -1;
        }
        /* Step over the opening parenthesis. */
        ++ctx->fmtpos;
        --ctx->fmtcnt;
        keystart = ctx->fmtpos;
        /* Skip over balanced parentheses */
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            if (arg->ch == ')')
                --pcount;
            else if (arg->ch == '(')
                ++pcount;
            ctx->fmtpos++;
        }
        /* fmtpos is now one past the closing ')', hence the -1. */
        keylen = ctx->fmtpos - keystart - 1;
        if (ctx->fmtcnt < 0 || pcount > 0) {
            PyErr_SetString(PyExc_ValueError,
                            "incomplete format key");
            return -1;
        }
        key = PyUnicode_Substring(ctx->fmtstr,
                                  keystart, keystart + keylen);
        if (key == NULL)
            return -1;
        /* Release the value fetched for a previous "%(name)" key. */
        if (ctx->args_owned) {
            ctx->args_owned = 0;
            Py_DECREF(ctx->args);
        }
        ctx->args = PyObject_GetItem(ctx->dict, key);
        Py_DECREF(key);
        if (ctx->args == NULL)
            return -1;
        ctx->args_owned = 1;
        /* Same arglen/argidx values that PyUnicode_Format() uses for a
           non-tuple argument. */
        ctx->arglen = -1;
        ctx->argidx = -2;
    }

    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
    while (--ctx->fmtcnt >= 0) {
        arg->ch = FORMAT_READ(ctx);
        ctx->fmtpos++;
        switch (arg->ch) {
        case '-': arg->flags |= F_LJUST; continue;
        case '+': arg->flags |= F_SIGN; continue;
        case ' ': arg->flags |= F_BLANK; continue;
        case '#': arg->flags |= F_ALT; continue;
        case '0': arg->flags |= F_ZERO; continue;
        }
        /* First character that is not a flag: leave it in arg->ch. */
        break;
    }

    /* Parse width. Example: "%10s" => width=10 */
    if (arg->ch == '*') {
        /* Width taken from the next argument. */
        v = unicode_format_getnextarg(ctx);
        if (v == NULL)
            return -1;
        if (!PyLong_Check(v)) {
            PyErr_SetString(PyExc_TypeError,
                            "* wants int");
            return -1;
        }
        arg->width = PyLong_AsSsize_t(v);
        if (arg->width == -1 && PyErr_Occurred())
            return -1;
        /* A negative width means left-justify with the absolute value. */
        if (arg->width < 0) {
            arg->flags |= F_LJUST;
            arg->width = -arg->width;
        }
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
    }
    else if (arg->ch >= '0' && arg->ch <= '9') {
        arg->width = arg->ch - '0';
        while (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
            if (arg->ch < '0' || arg->ch > '9')
                break;
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
               mixing signed and unsigned comparison. Since arg->ch is between
               '0' and '9', casting to int is safe. */
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return -1;
            }
            arg->width = arg->width*10 + (arg->ch - '0');
        }
    }

    /* Parse precision. Example: "%.3f" => prec=3 */
    if (arg->ch == '.') {
        /* A bare "." means precision 0. */
        arg->prec = 0;
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
        if (arg->ch == '*') {
            /* Precision taken from the next argument. */
            v = unicode_format_getnextarg(ctx);
            if (v == NULL)
                return -1;
            if (!PyLong_Check(v)) {
                PyErr_SetString(PyExc_TypeError,
                                "* wants int");
                return -1;
            }
            arg->prec = _PyLong_AsInt(v);
            if (arg->prec == -1 && PyErr_Occurred())
                return -1;
            /* A negative precision is clamped to 0. */
            if (arg->prec < 0)
                arg->prec = 0;
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
        else if (arg->ch >= '0' && arg->ch <= '9') {
            arg->prec = arg->ch - '0';
            while (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
                if (arg->ch < '0' || arg->ch > '9')
                    break;
                /* Same overflow guard as for the width, against INT_MAX. */
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return -1;
                }
                arg->prec = arg->prec*10 + (arg->ch - '0');
            }
        }
    }

    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
    if (ctx->fmtcnt >= 0) {
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
    }
    /* The format string ended before a conversion character was read. */
    if (ctx->fmtcnt < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "incomplete format");
        return -1;
    }
    return 0;

#undef FORMAT_READ
}
14630
14631/* Format one argument. Supported conversion specifiers:
14632
14633 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014634 - "i", "d", "u": int or float
14635 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014636 - "e", "E", "f", "F", "g", "G": float
14637 - "c": int or str (1 character)
14638
Victor Stinner8dbd4212012-12-04 09:30:24 +010014639 When possible, the output is written directly into the Unicode writer
14640 (ctx->writer). A string is created when padding is required.
14641
Victor Stinnera47082312012-10-04 02:19:54 +020014642 Return 0 if the argument has been formatted into *p_str,
14643 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014644 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014645static int
14646unicode_format_arg_format(struct unicode_formatter_t *ctx,
14647 struct unicode_format_arg_t *arg,
14648 PyObject **p_str)
14649{
14650 PyObject *v;
14651 _PyUnicodeWriter *writer = &ctx->writer;
14652
14653 if (ctx->fmtcnt == 0)
14654 ctx->writer.overallocate = 0;
14655
Victor Stinnera47082312012-10-04 02:19:54 +020014656 v = unicode_format_getnextarg(ctx);
14657 if (v == NULL)
14658 return -1;
14659
Victor Stinnera47082312012-10-04 02:19:54 +020014660
14661 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014662 case 's':
14663 case 'r':
14664 case 'a':
14665 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14666 /* Fast path */
14667 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14668 return -1;
14669 return 1;
14670 }
14671
14672 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14673 *p_str = v;
14674 Py_INCREF(*p_str);
14675 }
14676 else {
14677 if (arg->ch == 's')
14678 *p_str = PyObject_Str(v);
14679 else if (arg->ch == 'r')
14680 *p_str = PyObject_Repr(v);
14681 else
14682 *p_str = PyObject_ASCII(v);
14683 }
14684 break;
14685
14686 case 'i':
14687 case 'd':
14688 case 'u':
14689 case 'o':
14690 case 'x':
14691 case 'X':
14692 {
14693 int ret = mainformatlong(v, arg, p_str, writer);
14694 if (ret != 0)
14695 return ret;
14696 arg->sign = 1;
14697 break;
14698 }
14699
14700 case 'e':
14701 case 'E':
14702 case 'f':
14703 case 'F':
14704 case 'g':
14705 case 'G':
14706 if (arg->width == -1 && arg->prec == -1
14707 && !(arg->flags & (F_SIGN | F_BLANK)))
14708 {
14709 /* Fast path */
14710 if (formatfloat(v, arg, NULL, writer) == -1)
14711 return -1;
14712 return 1;
14713 }
14714
14715 arg->sign = 1;
14716 if (formatfloat(v, arg, p_str, NULL) == -1)
14717 return -1;
14718 break;
14719
14720 case 'c':
14721 {
14722 Py_UCS4 ch = formatchar(v);
14723 if (ch == (Py_UCS4) -1)
14724 return -1;
14725 if (arg->width == -1 && arg->prec == -1) {
14726 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014727 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014728 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014729 return 1;
14730 }
14731 *p_str = PyUnicode_FromOrdinal(ch);
14732 break;
14733 }
14734
14735 default:
14736 PyErr_Format(PyExc_ValueError,
14737 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014738 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014739 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14740 (int)arg->ch,
14741 ctx->fmtpos - 1);
14742 return -1;
14743 }
14744 if (*p_str == NULL)
14745 return -1;
14746 assert (PyUnicode_Check(*p_str));
14747 return 0;
14748}
14749
/* Write the formatted string str of one argument into ctx->writer,
   applying what the conversion still requires: truncation to the precision
   for "s", "r" and "a", sign character, "0x"/"0X"/"0o" prefix placement,
   and padding up to arg->width.

   arg->width, arg->sign are modified while the output is laid out.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;        /* number of characters of str still to copy */
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;     /* read position in str */
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only used for conversions that set arg->sign. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* Take the sign off the string; it is written separately. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character to write. */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* Account for the fill character if left padding will be written. */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* The sign needs one extra character when the width leaves no room. */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With a non-space fill the sign goes before the padding; with
           space fill it is written after the padding (see below). */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The prefix is excluded from width and len in both cases; with
           space fill, the prefix itself is emitted after the padding. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Padding done: no right padding will be added below. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    /* (the right padding is always made of spaces, whatever fill is) */
    if (arg->width > len) {
        sublen = arg->width - len;
        unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14904
14905/* Helper of PyUnicode_Format(): format one arg.
14906 Return 0 on success, raise an exception and return -1 on error. */
14907static int
14908unicode_format_arg(struct unicode_formatter_t *ctx)
14909{
14910 struct unicode_format_arg_t arg;
14911 PyObject *str;
14912 int ret;
14913
Victor Stinner8dbd4212012-12-04 09:30:24 +010014914 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014915 if (arg.ch == '%') {
14916 ctx->fmtpos++;
14917 ctx->fmtcnt--;
14918 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14919 return -1;
14920 return 0;
14921 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014922 arg.flags = 0;
14923 arg.width = -1;
14924 arg.prec = -1;
14925 arg.sign = 0;
14926 str = NULL;
14927
Victor Stinnera47082312012-10-04 02:19:54 +020014928 ret = unicode_format_arg_parse(ctx, &arg);
14929 if (ret == -1)
14930 return -1;
14931
14932 ret = unicode_format_arg_format(ctx, &arg, &str);
14933 if (ret == -1)
14934 return -1;
14935
14936 if (ret != 1) {
14937 ret = unicode_format_arg_output(ctx, &arg, str);
14938 Py_DECREF(str);
14939 if (ret == -1)
14940 return -1;
14941 }
14942
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014943 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014944 PyErr_SetString(PyExc_TypeError,
14945 "not all arguments converted during string formatting");
14946 return -1;
14947 }
14948 return 0;
14949}
14950
Alexander Belopolsky40018472011-02-26 01:02:56 +000014951PyObject *
14952PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014953{
Victor Stinnera47082312012-10-04 02:19:54 +020014954 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014955
Guido van Rossumd57fd912000-03-10 22:53:23 +000014956 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014957 PyErr_BadInternalCall();
14958 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014959 }
Victor Stinnera47082312012-10-04 02:19:54 +020014960
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014961 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014962 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014963
14964 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014965 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14966 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14967 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14968 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014969
Victor Stinner8f674cc2013-04-17 23:02:17 +020014970 _PyUnicodeWriter_Init(&ctx.writer);
14971 ctx.writer.min_length = ctx.fmtcnt + 100;
14972 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014973
Guido van Rossumd57fd912000-03-10 22:53:23 +000014974 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014975 ctx.arglen = PyTuple_Size(args);
14976 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014977 }
14978 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014979 ctx.arglen = -1;
14980 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014981 }
Victor Stinnera47082312012-10-04 02:19:54 +020014982 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014983 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014984 ctx.dict = args;
14985 else
14986 ctx.dict = NULL;
14987 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014988
Victor Stinnera47082312012-10-04 02:19:54 +020014989 while (--ctx.fmtcnt >= 0) {
14990 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014991 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014992
14993 nonfmtpos = ctx.fmtpos++;
14994 while (ctx.fmtcnt >= 0 &&
14995 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14996 ctx.fmtpos++;
14997 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014998 }
Victor Stinnera47082312012-10-04 02:19:54 +020014999 if (ctx.fmtcnt < 0) {
15000 ctx.fmtpos--;
15001 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015002 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015003
Victor Stinnercfc4c132013-04-03 01:48:39 +020015004 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15005 nonfmtpos, ctx.fmtpos) < 0)
15006 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015007 }
15008 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015009 ctx.fmtpos++;
15010 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015011 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015012 }
15013 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015014
Victor Stinnera47082312012-10-04 02:19:54 +020015015 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015016 PyErr_SetString(PyExc_TypeError,
15017 "not all arguments converted during string formatting");
15018 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015019 }
15020
Victor Stinnera47082312012-10-04 02:19:54 +020015021 if (ctx.args_owned) {
15022 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015023 }
Victor Stinnera47082312012-10-04 02:19:54 +020015024 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015025
Benjamin Peterson29060642009-01-31 22:14:21 +000015026 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015027 _PyUnicodeWriter_Dealloc(&ctx.writer);
15028 if (ctx.args_owned) {
15029 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015030 }
15031 return NULL;
15032}
15033
/* Forward declaration: unicode_new() calls this function when the requested
   type is not exactly PyUnicode_Type. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15036
Tim Peters6d6c1a32001-08-02 04:15:00 +000015037static PyObject *
15038unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15039{
Benjamin Peterson29060642009-01-31 22:14:21 +000015040 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015041 static char *kwlist[] = {"object", "encoding", "errors", 0};
15042 char *encoding = NULL;
15043 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015044
Benjamin Peterson14339b62009-01-31 16:36:08 +000015045 if (type != &PyUnicode_Type)
15046 return unicode_subtype_new(type, args, kwds);
15047 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015048 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015049 return NULL;
15050 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015051 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015052 if (encoding == NULL && errors == NULL)
15053 return PyObject_Str(x);
15054 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015055 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015056}
15057
Guido van Rossume023fe02001-08-30 03:12:59 +000015058static PyObject *
15059unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15060{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015061 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015062 Py_ssize_t length, char_size;
15063 int share_wstr, share_utf8;
15064 unsigned int kind;
15065 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015066
Benjamin Peterson14339b62009-01-31 16:36:08 +000015067 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015068
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015069 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015070 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015071 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015072 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015073 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015074 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015075 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015076 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015077
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015078 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015079 if (self == NULL) {
15080 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015081 return NULL;
15082 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015083 kind = PyUnicode_KIND(unicode);
15084 length = PyUnicode_GET_LENGTH(unicode);
15085
15086 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015087#ifdef Py_DEBUG
15088 _PyUnicode_HASH(self) = -1;
15089#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015090 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015091#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015092 _PyUnicode_STATE(self).interned = 0;
15093 _PyUnicode_STATE(self).kind = kind;
15094 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015095 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015096 _PyUnicode_STATE(self).ready = 1;
15097 _PyUnicode_WSTR(self) = NULL;
15098 _PyUnicode_UTF8_LENGTH(self) = 0;
15099 _PyUnicode_UTF8(self) = NULL;
15100 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015101 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015102
15103 share_utf8 = 0;
15104 share_wstr = 0;
15105 if (kind == PyUnicode_1BYTE_KIND) {
15106 char_size = 1;
15107 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15108 share_utf8 = 1;
15109 }
15110 else if (kind == PyUnicode_2BYTE_KIND) {
15111 char_size = 2;
15112 if (sizeof(wchar_t) == 2)
15113 share_wstr = 1;
15114 }
15115 else {
15116 assert(kind == PyUnicode_4BYTE_KIND);
15117 char_size = 4;
15118 if (sizeof(wchar_t) == 4)
15119 share_wstr = 1;
15120 }
15121
15122 /* Ensure we won't overflow the length. */
15123 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15124 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015125 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015126 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015127 data = PyObject_MALLOC((length + 1) * char_size);
15128 if (data == NULL) {
15129 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015130 goto onError;
15131 }
15132
Victor Stinnerc3c74152011-10-02 20:39:55 +020015133 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015134 if (share_utf8) {
15135 _PyUnicode_UTF8_LENGTH(self) = length;
15136 _PyUnicode_UTF8(self) = data;
15137 }
15138 if (share_wstr) {
15139 _PyUnicode_WSTR_LENGTH(self) = length;
15140 _PyUnicode_WSTR(self) = (wchar_t *)data;
15141 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015142
Christian Heimesf051e432016-09-13 20:22:02 +020015143 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015144 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015145 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015146#ifdef Py_DEBUG
15147 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15148#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015149 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015150 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015151
15152onError:
15153 Py_DECREF(unicode);
15154 Py_DECREF(self);
15155 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015156}
15157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015158PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015159"str(object='') -> str\n\
15160str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015161\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015162Create a new string object from the given object. If encoding or\n\
15163errors is specified, then the object must expose a data buffer\n\
15164that will be decoded using the given encoding and error handler.\n\
15165Otherwise, returns the result of object.__str__() (if defined)\n\
15166or repr(object).\n\
15167encoding defaults to sys.getdefaultencoding().\n\
15168errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015169
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015170static PyObject *unicode_iter(PyObject *seq);
15171
Guido van Rossumd57fd912000-03-10 22:53:23 +000015172PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015173 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015174 "str", /* tp_name */
15175 sizeof(PyUnicodeObject), /* tp_basicsize */
15176 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015177 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015178 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015179 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015180 0, /* tp_getattr */
15181 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015182 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015183 unicode_repr, /* tp_repr */
15184 &unicode_as_number, /* tp_as_number */
15185 &unicode_as_sequence, /* tp_as_sequence */
15186 &unicode_as_mapping, /* tp_as_mapping */
15187 (hashfunc) unicode_hash, /* tp_hash*/
15188 0, /* tp_call*/
15189 (reprfunc) unicode_str, /* tp_str */
15190 PyObject_GenericGetAttr, /* tp_getattro */
15191 0, /* tp_setattro */
15192 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015193 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015194 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15195 unicode_doc, /* tp_doc */
15196 0, /* tp_traverse */
15197 0, /* tp_clear */
15198 PyUnicode_RichCompare, /* tp_richcompare */
15199 0, /* tp_weaklistoffset */
15200 unicode_iter, /* tp_iter */
15201 0, /* tp_iternext */
15202 unicode_methods, /* tp_methods */
15203 0, /* tp_members */
15204 0, /* tp_getset */
15205 &PyBaseObject_Type, /* tp_base */
15206 0, /* tp_dict */
15207 0, /* tp_descr_get */
15208 0, /* tp_descr_set */
15209 0, /* tp_dictoffset */
15210 0, /* tp_init */
15211 0, /* tp_alloc */
15212 unicode_new, /* tp_new */
15213 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015214};
15215
15216/* Initialize the Unicode implementation */
15217
Victor Stinner331a6a52019-05-27 16:39:22 +020015218PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015219_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015220{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015221 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015222 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015223 0x000A, /* LINE FEED */
15224 0x000D, /* CARRIAGE RETURN */
15225 0x001C, /* FILE SEPARATOR */
15226 0x001D, /* GROUP SEPARATOR */
15227 0x001E, /* RECORD SEPARATOR */
15228 0x0085, /* NEXT LINE */
15229 0x2028, /* LINE SEPARATOR */
15230 0x2029, /* PARAGRAPH SEPARATOR */
15231 };
15232
Fred Drakee4315f52000-05-09 19:53:39 +000015233 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015234 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015235 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015236 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015237 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015238 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015239
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015240 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015241 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015242 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015243
15244 /* initialize the linebreak bloom filter */
15245 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015246 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015247 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015248
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015249 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015250 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015251 }
15252 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015253 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015254 }
15255 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015256 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015257 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015258 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015259}
15260
15261/* Finalize the Unicode implementation */
15262
/* This implementation releases nothing and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15268
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015269
Walter Dörwald16807132007-05-25 13:52:07 +000015270void
15271PyUnicode_InternInPlace(PyObject **p)
15272{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015273 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015274 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015275#ifdef Py_DEBUG
15276 assert(s != NULL);
15277 assert(_PyUnicode_CHECK(s));
15278#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015279 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015280 return;
15281#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015282 /* If it's a subclass, we don't really know what putting
15283 it in the interned dict might do. */
15284 if (!PyUnicode_CheckExact(s))
15285 return;
15286 if (PyUnicode_CHECK_INTERNED(s))
15287 return;
15288 if (interned == NULL) {
15289 interned = PyDict_New();
15290 if (interned == NULL) {
15291 PyErr_Clear(); /* Don't leave an exception */
15292 return;
15293 }
15294 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015295 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015296 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015297 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015298 if (t == NULL) {
15299 PyErr_Clear();
15300 return;
15301 }
15302 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015303 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015304 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015305 return;
15306 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015307 /* The two references in interned are not counted by refcnt.
15308 The deallocator will take care of this */
15309 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015310 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015311}
15312
15313void
15314PyUnicode_InternImmortal(PyObject **p)
15315{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015316 PyUnicode_InternInPlace(p);
15317 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015318 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015319 Py_INCREF(*p);
15320 }
Walter Dörwald16807132007-05-25 13:52:07 +000015321}
15322
15323PyObject *
15324PyUnicode_InternFromString(const char *cp)
15325{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015326 PyObject *s = PyUnicode_FromString(cp);
15327 if (s == NULL)
15328 return NULL;
15329 PyUnicode_InternInPlace(&s);
15330 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015331}
15332
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015333
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Give every interned string its uncounted references back, reset its
   state to SSTATE_NOT_INTERNED, then clear and release the interned dict.
   Does nothing if the dict was never created. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }

    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t immortal_size = 0, mortal_size = 0;
    Py_ssize_t count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif

    for (Py_ssize_t idx = 0; idx < count; idx++) {
        PyObject *str = PyList_GET_ITEM(keys, idx);

        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }

        switch (PyUnicode_CHECK_INTERNED(str)) {
        case SSTATE_INTERNED_MORTAL:
            /* Restore the two references PyUnicode_InternInPlace()
               subtracted. */
            Py_REFCNT(str) += 2;
            mortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(str) += 1;
            immortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }

#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif

    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015393
15394
15395/********************* Unicode Iterator **************************/
15396
15397typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015398 PyObject_HEAD
15399 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015400 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015401} unicodeiterobject;
15402
15403static void
15404unicodeiter_dealloc(unicodeiterobject *it)
15405{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015406 _PyObject_GC_UNTRACK(it);
15407 Py_XDECREF(it->it_seq);
15408 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015409}
15410
15411static int
15412unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15413{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015414 Py_VISIT(it->it_seq);
15415 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015416}
15417
15418static PyObject *
15419unicodeiter_next(unicodeiterobject *it)
15420{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015421 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015422
Benjamin Peterson14339b62009-01-31 16:36:08 +000015423 assert(it != NULL);
15424 seq = it->it_seq;
15425 if (seq == NULL)
15426 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015427 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015429 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15430 int kind = PyUnicode_KIND(seq);
15431 void *data = PyUnicode_DATA(seq);
15432 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15433 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015434 if (item != NULL)
15435 ++it->it_index;
15436 return item;
15437 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015438
Benjamin Peterson14339b62009-01-31 16:36:08 +000015439 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015440 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015441 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015442}
15443
15444static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015445unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015446{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015447 Py_ssize_t len = 0;
15448 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015449 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015450 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015451}
15452
15453PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15454
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015455static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015456unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015457{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015458 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015459 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015460 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015461 it->it_seq, it->it_index);
15462 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015463 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015464 if (u == NULL)
15465 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015466 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015467 }
15468}
15469
15470PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15471
15472static PyObject *
15473unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15474{
15475 Py_ssize_t index = PyLong_AsSsize_t(state);
15476 if (index == -1 && PyErr_Occurred())
15477 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015478 if (it->it_seq != NULL) {
15479 if (index < 0)
15480 index = 0;
15481 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15482 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15483 it->it_index = index;
15484 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015485 Py_RETURN_NONE;
15486}
15487
15488PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15489
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015490static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015491 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015492 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015493 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15494 reduce_doc},
15495 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15496 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015497 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015498};
15499
15500PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015501 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15502 "str_iterator", /* tp_name */
15503 sizeof(unicodeiterobject), /* tp_basicsize */
15504 0, /* tp_itemsize */
15505 /* methods */
15506 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015507 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015508 0, /* tp_getattr */
15509 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015510 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015511 0, /* tp_repr */
15512 0, /* tp_as_number */
15513 0, /* tp_as_sequence */
15514 0, /* tp_as_mapping */
15515 0, /* tp_hash */
15516 0, /* tp_call */
15517 0, /* tp_str */
15518 PyObject_GenericGetAttr, /* tp_getattro */
15519 0, /* tp_setattro */
15520 0, /* tp_as_buffer */
15521 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15522 0, /* tp_doc */
15523 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15524 0, /* tp_clear */
15525 0, /* tp_richcompare */
15526 0, /* tp_weaklistoffset */
15527 PyObject_SelfIter, /* tp_iter */
15528 (iternextfunc)unicodeiter_next, /* tp_iternext */
15529 unicodeiter_methods, /* tp_methods */
15530 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015531};
15532
15533static PyObject *
15534unicode_iter(PyObject *seq)
15535{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015536 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015537
Benjamin Peterson14339b62009-01-31 16:36:08 +000015538 if (!PyUnicode_Check(seq)) {
15539 PyErr_BadInternalCall();
15540 return NULL;
15541 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015542 if (PyUnicode_READY(seq) == -1)
15543 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015544 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15545 if (it == NULL)
15546 return NULL;
15547 it->it_index = 0;
15548 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015549 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015550 _PyObject_GC_TRACK(it);
15551 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015552}
15553
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015554
15555size_t
15556Py_UNICODE_strlen(const Py_UNICODE *u)
15557{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015558 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015559}
15560
15561Py_UNICODE*
15562Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15563{
15564 Py_UNICODE *u = s1;
15565 while ((*u++ = *s2++));
15566 return s1;
15567}
15568
15569Py_UNICODE*
15570Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15571{
15572 Py_UNICODE *u = s1;
15573 while ((*u++ = *s2++))
15574 if (n-- == 0)
15575 break;
15576 return s1;
15577}
15578
15579Py_UNICODE*
15580Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15581{
15582 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015583 u1 += wcslen(u1);
15584 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015585 return s1;
15586}
15587
15588int
15589Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15590{
15591 while (*s1 && *s2 && *s1 == *s2)
15592 s1++, s2++;
15593 if (*s1 && *s2)
15594 return (*s1 < *s2) ? -1 : +1;
15595 if (*s1)
15596 return 1;
15597 if (*s2)
15598 return -1;
15599 return 0;
15600}
15601
15602int
15603Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15604{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015605 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015606 for (; n != 0; n--) {
15607 u1 = *s1;
15608 u2 = *s2;
15609 if (u1 != u2)
15610 return (u1 < u2) ? -1 : +1;
15611 if (u1 == '\0')
15612 return 0;
15613 s1++;
15614 s2++;
15615 }
15616 return 0;
15617}
15618
15619Py_UNICODE*
15620Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15621{
15622 const Py_UNICODE *p;
15623 for (p = s; *p; p++)
15624 if (*p == c)
15625 return (Py_UNICODE*)p;
15626 return NULL;
15627}
15628
15629Py_UNICODE*
15630Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15631{
15632 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015633 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015634 while (p != s) {
15635 p--;
15636 if (*p == c)
15637 return (Py_UNICODE*)p;
15638 }
15639 return NULL;
15640}
Victor Stinner331ea922010-08-10 16:37:20 +000015641
Victor Stinner71133ff2010-09-01 23:43:53 +000015642Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015643PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015644{
Victor Stinner577db2c2011-10-11 22:12:48 +020015645 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015646 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015648 if (!PyUnicode_Check(unicode)) {
15649 PyErr_BadArgument();
15650 return NULL;
15651 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015652 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015653 if (u == NULL)
15654 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015655 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015656 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015657 PyErr_NoMemory();
15658 return NULL;
15659 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015660 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015661 size *= sizeof(Py_UNICODE);
15662 copy = PyMem_Malloc(size);
15663 if (copy == NULL) {
15664 PyErr_NoMemory();
15665 return NULL;
15666 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015667 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015668 return copy;
15669}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015670
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015671
Victor Stinner709d23d2019-05-02 14:56:30 -040015672static int
15673encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015674{
Victor Stinner709d23d2019-05-02 14:56:30 -040015675 int res;
15676 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15677 if (res == -2) {
15678 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15679 return -1;
15680 }
15681 if (res < 0) {
15682 PyErr_NoMemory();
15683 return -1;
15684 }
15685 return 0;
15686}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015687
Victor Stinner709d23d2019-05-02 14:56:30 -040015688
15689static int
15690config_get_codec_name(wchar_t **config_encoding)
15691{
15692 char *encoding;
15693 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15694 return -1;
15695 }
15696
15697 PyObject *name_obj = NULL;
15698 PyObject *codec = _PyCodec_Lookup(encoding);
15699 PyMem_RawFree(encoding);
15700
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015701 if (!codec)
15702 goto error;
15703
15704 name_obj = PyObject_GetAttrString(codec, "name");
15705 Py_CLEAR(codec);
15706 if (!name_obj) {
15707 goto error;
15708 }
15709
Victor Stinner709d23d2019-05-02 14:56:30 -040015710 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15711 Py_DECREF(name_obj);
15712 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015713 goto error;
15714 }
15715
Victor Stinner709d23d2019-05-02 14:56:30 -040015716 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15717 if (raw_wname == NULL) {
15718 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015719 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015720 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015721 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015722
15723 PyMem_RawFree(*config_encoding);
15724 *config_encoding = raw_wname;
15725
15726 PyMem_Free(wname);
15727 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015728
15729error:
15730 Py_XDECREF(codec);
15731 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015732 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015733}
15734
15735
Victor Stinner331a6a52019-05-27 16:39:22 +020015736static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015737init_stdio_encoding(PyInterpreterState *interp)
15738{
Victor Stinner709d23d2019-05-02 14:56:30 -040015739 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinner331a6a52019-05-27 16:39:22 +020015740 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015741 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015742 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015743 "of the stdio encoding");
15744 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015745 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015746}
15747
15748
Victor Stinner709d23d2019-05-02 14:56:30 -040015749static int
15750init_fs_codec(PyInterpreterState *interp)
15751{
Victor Stinner331a6a52019-05-27 16:39:22 +020015752 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015753
15754 _Py_error_handler error_handler;
15755 error_handler = get_error_handler_wide(config->filesystem_errors);
15756 if (error_handler == _Py_ERROR_UNKNOWN) {
15757 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15758 return -1;
15759 }
15760
15761 char *encoding, *errors;
15762 if (encode_wstr_utf8(config->filesystem_encoding,
15763 &encoding,
15764 "filesystem_encoding") < 0) {
15765 return -1;
15766 }
15767
15768 if (encode_wstr_utf8(config->filesystem_errors,
15769 &errors,
15770 "filesystem_errors") < 0) {
15771 PyMem_RawFree(encoding);
15772 return -1;
15773 }
15774
15775 PyMem_RawFree(interp->fs_codec.encoding);
15776 interp->fs_codec.encoding = encoding;
15777 PyMem_RawFree(interp->fs_codec.errors);
15778 interp->fs_codec.errors = errors;
15779 interp->fs_codec.error_handler = error_handler;
15780
15781 /* At this point, PyUnicode_EncodeFSDefault() and
15782 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15783 the C implementation of the filesystem encoding. */
15784
15785 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15786 global configuration variables. */
15787 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15788 interp->fs_codec.errors) < 0) {
15789 PyErr_NoMemory();
15790 return -1;
15791 }
15792 return 0;
15793}
15794
15795
Victor Stinner331a6a52019-05-27 16:39:22 +020015796static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015797init_fs_encoding(PyInterpreterState *interp)
15798{
Victor Stinner709d23d2019-05-02 14:56:30 -040015799 /* Update the filesystem encoding to the normalized Python codec name.
15800 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15801 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015802 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015803 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015804 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015805 "of the filesystem encoding");
15806 }
15807
Victor Stinner709d23d2019-05-02 14:56:30 -040015808 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015809 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015810 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015811 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015812}
15813
15814
Victor Stinner331a6a52019-05-27 16:39:22 +020015815PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015816_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015817{
Victor Stinnerb45d2592019-06-20 00:05:23 +020015818 PyInterpreterState *interp = tstate->interp;
15819
Victor Stinner331a6a52019-05-27 16:39:22 +020015820 PyStatus status = init_fs_encoding(interp);
15821 if (_PyStatus_EXCEPTION(status)) {
15822 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015823 }
15824
15825 return init_stdio_encoding(interp);
15826}
15827
15828
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to "mbcs" with the
   "replace" error handler and re-initialize the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set if the
   new configuration strings cannot be allocated (the configuration is then
   left unchanged). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        /* Release whichever of the two was allocated. */
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
15854
15855
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015856void
15857_PyUnicode_Fini(void)
15858{
15859#if defined(WITH_VALGRIND) || defined(__INSURE__)
15860 /* Insure++ is a memory analysis tool that aids in discovering
15861 * memory leaks and other memory problems. On Python exit, the
15862 * interned string dictionaries are flagged as being in use at exit
15863 * (which it is). Under normal circumstances, this is fine because
15864 * the memory will be automatically reclaimed by the system. Under
15865 * memory debugging, it's a huge source of useless noise, so we
15866 * trade off slower shutdown for less distraction in the memory
15867 * reports. -baw
15868 */
15869 unicode_release_interned();
15870#endif /* __INSURE__ */
15871
15872 Py_CLEAR(unicode_empty);
15873
15874 for (Py_ssize_t i = 0; i < 256; i++) {
15875 Py_CLEAR(unicode_latin1[i]);
15876 }
15877 _PyUnicode_ClearStaticStrings();
15878 (void)PyUnicode_ClearFreeList();
Victor Stinner709d23d2019-05-02 14:56:30 -040015879
15880 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15881 PyMem_RawFree(interp->fs_codec.encoding);
15882 interp->fs_codec.encoding = NULL;
15883 PyMem_RawFree(interp->fs_codec.errors);
15884 interp->fs_codec.errors = NULL;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015885}
15886
15887
Georg Brandl66c221e2010-10-14 07:04:07 +000015888/* A _string module, to export formatter_parser and formatter_field_name_split
15889 to the string.Formatter class implemented in Python. */
15890
15891static PyMethodDef _string_methods[] = {
15892 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15893 METH_O, PyDoc_STR("split the argument as a field name")},
15894 {"formatter_parser", (PyCFunction) formatter_parser,
15895 METH_O, PyDoc_STR("parse the argument as a format string")},
15896 {NULL, NULL}
15897};
15898
15899static struct PyModuleDef _string_module = {
15900 PyModuleDef_HEAD_INIT,
15901 "_string",
15902 PyDoc_STR("string helper module"),
15903 0,
15904 _string_methods,
15905 NULL,
15906 NULL,
15907 NULL,
15908 NULL
15909};
15910
15911PyMODINIT_FUNC
15912PyInit__string(void)
15913{
15914 return PyModule_Create(&_string_module);
15915}
15916
15917
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015918#ifdef __cplusplus
15919}
15920#endif