blob: f6bf505b7fc7480a62936be01e026927e7658871 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner47e1afd2020-10-26 16:43:47 +010043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinnerba3d67c2020-12-26 00:41:46 +010044#include "pycore_atomic_funcs.h" // _Py_atomic_size_get()
Victor Stinner47e1afd2020-10-26 16:43:47 +010045#include "pycore_bytes_methods.h" // _Py_bytes_lower()
Serhiy Storchaka2ad93822020-12-03 12:46:16 +020046#include "pycore_format.h" // F_LJUST
Victor Stinner47e1afd2020-10-26 16:43:47 +010047#include "pycore_initconfig.h" // _PyStatus_OK()
48#include "pycore_interp.h" // PyInterpreterState.fs_codec
49#include "pycore_object.h" // _PyObject_GC_TRACK()
50#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
51#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
52#include "pycore_pystate.h" // _PyInterpreterState_GET()
53#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
54#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000056#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000057#include <windows.h>
58#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059
Victor Stinner666ecfb2020-07-02 01:19:57 +020060/* Uncomment to display statistics on interned strings at exit
61 in _PyUnicode_ClearInterned(). */
Victor Stinnerfecc4f22019-03-19 14:20:29 +010062/* #define INTERNED_STATS 1 */
63
64
Larry Hastings61272b72014-01-07 12:41:53 -080065/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090066class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080067[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090068/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
69
70/*[python input]
71class Py_UCS4_converter(CConverter):
72 type = 'Py_UCS4'
73 converter = 'convert_uc'
74
75 def converter_init(self):
76 if self.default is not unspecified:
77 self.c_default = ascii(self.default)
78 if len(self.c_default) > 4 or self.c_default[0] != "'":
79 self.c_default = hex(ord(self.default))
80
81[python start generated code]*/
82/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080083
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
Serhiy Storchaka05997252013-01-26 12:14:02 +020086NOTE: In the interpreter's initialization phase, some globals are currently
87 initialized dynamically as needed. In the process Unicode objects may
88 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000089
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
/* Largest valid Unicode code point, U+10FFFF (1,114,111 in decimal).
   fileutils.c carries its own copy of this limit; keep both in sync. */
#define MAX_UNICODE 0x10ffff
100
/* Type check used by the assertions in the accessor macros of this file.
   Regular builds only test the type; Py_DEBUG builds additionally run
   _PyUnicode_CheckConsistency() with check_content set to 0. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200106
/* UTF-8 and wstr field accessors.

   The underscore-prefixed forms read the struct field directly and can be
   used as lvalues.  PyUnicode_UTF8() and PyUnicode_UTF8_LENGTH() assert that
   op is a ready string; for a compact ASCII string they yield the bytes
   located right after the PyASCIIObject header (respectively the string
   length) instead of the utf8 / utf8_length fields.

   Every macro evaluates op more than once. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900125
/* Replace the deprecated PyUnicode_WSTR_LENGTH() macro of unicodeobject.h
   with a local definition.

   Evaluates to the string length for a compact ASCII string and to the
   wstr_length field otherwise.  The argument is parenthesized inside each
   cast so that an expression argument (for example a pointer plus an offset)
   is converted as a whole, like in the other accessor macros of this file.
   op is evaluated more than once. */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Direct field accessors.  _PyUnicode_WSTR_LENGTH, _PyUnicode_LENGTH,
   _PyUnicode_STATE, _PyUnicode_HASH and _PyUnicode_DATA_ANY perform no
   checking and are usable as lvalues; _PyUnicode_KIND and
   _PyUnicode_GET_LENGTH first assert _PyUnicode_CHECK(op). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200148
/* File-local replacement for PyUnicode_READY(): asserts _PyUnicode_CHECK(op),
   then evaluates to 0 if the string is already ready and to the result of
   _PyUnicode_Ready(op) otherwise.  op is evaluated more than once. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200155
/* True if the utf8 pointer of op is the same pointer as its character data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op is the same pointer as its character data.
   The comparison uses the macro argument itself, so the macro works
   whatever the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
163
/* True if the string owns a separate UTF-8 buffer: it is not compact ASCII,
   its utf8 pointer is set, and that pointer differs from the character
   data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))
170
/* True if the string owns a separate wstr buffer: its wstr pointer is set
   and either the string is not ready yet or the pointer differs from the
   character data pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op)                                \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
177
/* Copy a run of characters while converting the element type.

   from_type / to_type: valid type names of the source and target elements.
   begin / end:         delimit the source run (converted to
                        "const from_type *").
   to:                  start of the output buffer (converted to "to_type *").

   Each element is converted with a plain cast.  The bulk of the run is
   copied four elements per iteration; the remaining 0-3 elements are copied
   one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_cvt_dst = (to_type *)(to);                            \
        const from_type *_cvt_src = (const from_type *)(begin);         \
        const from_type *_cvt_end = (const from_type *)(end);           \
        Py_ssize_t _cvt_len = (_cvt_end) - (_cvt_src);                  \
        const from_type *_cvt_quad_end =                                \
            _cvt_src + _Py_SIZE_ROUND_DOWN(_cvt_len, 4);                \
        for (; _cvt_src < (_cvt_quad_end);                              \
               _cvt_src += 4, _cvt_dst += 4) {                          \
            _cvt_dst[0] = (to_type) _cvt_src[0];                        \
            _cvt_dst[1] = (to_type) _cvt_src[1];                        \
            _cvt_dst[2] = (to_type) _cvt_src[2];                        \
            _cvt_dst[3] = (to_type) _cvt_src[3];                        \
        }                                                               \
        for (; _cvt_src < (_cvt_end); ++_cvt_src, ++_cvt_dst) {         \
            *_cvt_dst = (to_type) *_cvt_src;                            \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200201
/* Platform-tuned overallocation setting: a value of 4 stands for
   overallocating by 25% (found best on Linux), a value of 2 for
   overallocating by 50% (found best on Windows). */
#ifndef MS_WINDOWS
#  define OVERALLOCATE_FACTOR 4
#else
#  define OVERALLOCATE_FACTOR 2
#endif
209
Walter Dörwald16807132007-05-25 13:52:07 +0000210
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200211static struct _Py_unicode_state*
212get_unicode_state(void)
213{
214 PyInterpreterState *interp = _PyInterpreterState_GET();
215 return &interp->unicode;
216}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200217
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000218
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200219// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200220static inline PyObject* unicode_get_empty(void)
221{
222 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200223 // unicode_get_empty() must not be called before _PyUnicode_Init()
224 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200225 assert(state->empty_string != NULL);
226 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200227}
228
Victor Stinner91698d82020-06-25 14:07:40 +0200229
230// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200231static inline PyObject* unicode_new_empty(void)
232{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200233 PyObject *empty = unicode_get_empty();
234 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200235 return empty;
236}
237
/* Statement macro: return a strong reference to the empty string singleton
   from the enclosing function. */
#define _Py_RETURN_UNICODE_EMPTY() \
    do { return unicode_new_empty(); } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000242
Victor Stinner59423e32018-11-26 13:40:01 +0100243static inline void
244unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
245 Py_ssize_t start, Py_ssize_t length)
246{
247 assert(0 <= start);
248 assert(kind != PyUnicode_WCHAR_KIND);
249 switch (kind) {
250 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100251 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100252 Py_UCS1 ch = (unsigned char)value;
253 Py_UCS1 *to = (Py_UCS1 *)data + start;
254 memset(to, ch, length);
255 break;
256 }
257 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100258 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100259 Py_UCS2 ch = (Py_UCS2)value;
260 Py_UCS2 *to = (Py_UCS2 *)data + start;
261 const Py_UCS2 *end = to + length;
262 for (; to < end; ++to) *to = ch;
263 break;
264 }
265 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100266 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100267 Py_UCS4 ch = value;
268 Py_UCS4 * to = (Py_UCS4 *)data + start;
269 const Py_UCS4 *end = to + length;
270 for (; to < end; ++to) *to = ch;
271 break;
272 }
273 default: Py_UNREACHABLE();
274 }
275}
276
277
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200278/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700279static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200280_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900281static inline void
282_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400283static PyObject *
284unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
285 const char *errors);
286static PyObject *
287unicode_decode_utf8(const char *s, Py_ssize_t size,
288 _Py_error_handler error_handler, const char *errors,
289 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200290
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry c is 1 if the ASCII character c counts as whitespace
   and 0 otherwise.  The table has 128 entries, 16 per row below. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
321
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200322/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200323static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200324static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100325static int unicode_modifiable(PyObject *unicode);
326
Victor Stinnerfe226c02011-10-03 03:52:20 +0200327
Alexander Belopolsky40018472011-02-26 01:02:56 +0000328static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100329_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200330static PyObject *
331_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
332static PyObject *
333_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
334
335static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000336unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000337 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100338 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000339 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
340
Alexander Belopolsky40018472011-02-26 01:02:56 +0000341static void
342raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300343 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100344 PyObject *unicode,
345 Py_ssize_t startpos, Py_ssize_t endpos,
346 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000347
/* Lookup table for fast detection of line breaks: entry c is 1 if the
   ASCII character c is treated as a line break and 0 otherwise.  The table
   has 128 entries, 16 per row below. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
375
INADA Naoki3ae20562017-01-16 20:41:20 +0900376static int convert_uc(PyObject *obj, void *addr);
377
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300378#include "clinic/unicodeobject.c.h"
379
Victor Stinner3d4226a2018-08-29 22:21:32 +0200380_Py_error_handler
381_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200382{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200383 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200384 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200385 }
386 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200387 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200388 }
389 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200390 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200391 }
392 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200393 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200394 }
395 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200396 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200397 }
398 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200399 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200400 }
401 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200402 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200403 }
Victor Stinner50149202015-09-22 00:26:54 +0200404 return _Py_ERROR_OTHER;
405}
406
Victor Stinner709d23d2019-05-02 14:56:30 -0400407
408static _Py_error_handler
409get_error_handler_wide(const wchar_t *errors)
410{
411 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
412 return _Py_ERROR_STRICT;
413 }
414 if (wcscmp(errors, L"surrogateescape") == 0) {
415 return _Py_ERROR_SURROGATEESCAPE;
416 }
417 if (wcscmp(errors, L"replace") == 0) {
418 return _Py_ERROR_REPLACE;
419 }
420 if (wcscmp(errors, L"ignore") == 0) {
421 return _Py_ERROR_IGNORE;
422 }
423 if (wcscmp(errors, L"backslashreplace") == 0) {
424 return _Py_ERROR_BACKSLASHREPLACE;
425 }
426 if (wcscmp(errors, L"surrogatepass") == 0) {
427 return _Py_ERROR_SURROGATEPASS;
428 }
429 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
430 return _Py_ERROR_XMLCHARREFREPLACE;
431 }
432 return _Py_ERROR_OTHER;
433}
434
435
Victor Stinner22eb6892019-06-26 00:51:05 +0200436static inline int
437unicode_check_encoding_errors(const char *encoding, const char *errors)
438{
439 if (encoding == NULL && errors == NULL) {
440 return 0;
441 }
442
Victor Stinner81a7be32020-04-14 15:14:01 +0200443 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200444#ifndef Py_DEBUG
445 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200446 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200447 return 0;
448 }
449#else
450 /* Always check in debug mode */
451#endif
452
453 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
454 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200455 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200456 return 0;
457 }
458
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200459 /* Disable checks during Python finalization. For example, it allows to
460 call _PyObject_Dump() during finalization for debugging purpose. */
461 if (interp->finalizing) {
462 return 0;
463 }
464
Victor Stinner22eb6892019-06-26 00:51:05 +0200465 if (encoding != NULL) {
466 PyObject *handler = _PyCodec_Lookup(encoding);
467 if (handler == NULL) {
468 return -1;
469 }
470 Py_DECREF(handler);
471 }
472
473 if (errors != NULL) {
474 PyObject *handler = PyCodec_LookupError(errors);
475 if (handler == NULL) {
476 return -1;
477 }
478 Py_DECREF(handler);
479 }
480 return 0;
481}
482
483
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200484int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100485_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200486{
Victor Stinner68762572019-10-07 18:42:01 +0200487#define CHECK(expr) \
488 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
489
Victor Stinner910337b2011-10-03 03:20:16 +0200490 PyASCIIObject *ascii;
491 unsigned int kind;
492
Victor Stinner68762572019-10-07 18:42:01 +0200493 assert(op != NULL);
494 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200495
496 ascii = (PyASCIIObject *)op;
497 kind = ascii->state.kind;
498
Victor Stinnera3b334d2011-10-03 13:53:37 +0200499 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200500 CHECK(kind == PyUnicode_1BYTE_KIND);
501 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200502 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200503 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200504 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200505 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200506
Victor Stinnera41463c2011-10-04 01:05:08 +0200507 if (ascii->state.compact == 1) {
508 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200509 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200510 || kind == PyUnicode_2BYTE_KIND
511 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200512 CHECK(ascii->state.ascii == 0);
513 CHECK(ascii->state.ready == 1);
514 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100515 }
516 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200517 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
518
519 data = unicode->data.any;
520 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200521 CHECK(ascii->length == 0);
522 CHECK(ascii->hash == -1);
523 CHECK(ascii->state.compact == 0);
524 CHECK(ascii->state.ascii == 0);
525 CHECK(ascii->state.ready == 0);
526 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
527 CHECK(ascii->wstr != NULL);
528 CHECK(data == NULL);
529 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200530 }
531 else {
Victor Stinner68762572019-10-07 18:42:01 +0200532 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200533 || kind == PyUnicode_2BYTE_KIND
534 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200535 CHECK(ascii->state.compact == 0);
536 CHECK(ascii->state.ready == 1);
537 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200538 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200539 CHECK(compact->utf8 == data);
540 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200541 }
542 else
Victor Stinner68762572019-10-07 18:42:01 +0200543 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200544 }
545 }
546 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200547 if (
548#if SIZEOF_WCHAR_T == 2
549 kind == PyUnicode_2BYTE_KIND
550#else
551 kind == PyUnicode_4BYTE_KIND
552#endif
553 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200554 {
Victor Stinner68762572019-10-07 18:42:01 +0200555 CHECK(ascii->wstr == data);
556 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 } else
Victor Stinner68762572019-10-07 18:42:01 +0200558 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200559 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200560
561 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200562 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200563 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200564 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200565 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200566
567 /* check that the best kind is used: O(n) operation */
568 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200569 Py_ssize_t i;
570 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300571 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200572 Py_UCS4 ch;
573
574 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200575 for (i=0; i < ascii->length; i++)
576 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200577 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200578 if (ch > maxchar)
579 maxchar = ch;
580 }
581 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100582 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200583 CHECK(maxchar >= 128);
584 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100585 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200586 else
Victor Stinner68762572019-10-07 18:42:01 +0200587 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200588 }
Victor Stinner77faf692011-11-20 18:56:05 +0100589 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200590 CHECK(maxchar >= 0x100);
591 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100592 }
593 else {
Victor Stinner68762572019-10-07 18:42:01 +0200594 CHECK(maxchar >= 0x10000);
595 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100596 }
Victor Stinner68762572019-10-07 18:42:01 +0200597 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200598 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400599 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200600
601#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400602}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200603
Victor Stinner910337b2011-10-03 03:20:16 +0200604
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100605static PyObject*
606unicode_result_wchar(PyObject *unicode)
607{
608#ifndef Py_DEBUG
609 Py_ssize_t len;
610
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100611 len = _PyUnicode_WSTR_LENGTH(unicode);
612 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100613 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200614 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100615 }
616
617 if (len == 1) {
618 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100619 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100620 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200621 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100622 }
623 }
624
625 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200626 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100627 return NULL;
628 }
629#else
Victor Stinneraa771272012-10-04 02:32:58 +0200630 assert(Py_REFCNT(unicode) == 1);
631
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100632 /* don't make the result ready in debug mode to ensure that the caller
633 makes the string ready before using it */
634 assert(_PyUnicode_CheckConsistency(unicode, 1));
635#endif
636 return unicode;
637}
638
639static PyObject*
640unicode_result_ready(PyObject *unicode)
641{
642 Py_ssize_t length;
643
644 length = PyUnicode_GET_LENGTH(unicode);
645 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200646 PyObject *empty = unicode_get_empty();
647 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100648 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200649 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100650 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200651 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100652 }
653
654 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200655 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200656 if (kind == PyUnicode_1BYTE_KIND) {
657 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
658 Py_UCS1 ch = data[0];
659 struct _Py_unicode_state *state = get_unicode_state();
660 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100661 if (latin1_char != NULL) {
662 if (unicode != latin1_char) {
663 Py_INCREF(latin1_char);
664 Py_DECREF(unicode);
665 }
666 return latin1_char;
667 }
668 else {
669 assert(_PyUnicode_CheckConsistency(unicode, 1));
670 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200671 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100672 return unicode;
673 }
674 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200675 else {
676 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
677 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100678 }
679
680 assert(_PyUnicode_CheckConsistency(unicode, 1));
681 return unicode;
682}
683
684static PyObject*
685unicode_result(PyObject *unicode)
686{
687 assert(_PyUnicode_CHECK(unicode));
688 if (PyUnicode_IS_READY(unicode))
689 return unicode_result_ready(unicode);
690 else
691 return unicode_result_wchar(unicode);
692}
693
Victor Stinnerc4b49542011-12-11 22:44:26 +0100694static PyObject*
695unicode_result_unchanged(PyObject *unicode)
696{
697 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500698 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100699 return NULL;
700 Py_INCREF(unicode);
701 return unicode;
702 }
703 else
704 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100705 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100706}
707
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200708/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
709 ASCII, Latin1, UTF-8, etc. */
710static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200711backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200712 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
713{
Victor Stinnerad771582015-10-09 12:38:53 +0200714 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200715 Py_UCS4 ch;
716 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300717 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200718
719 assert(PyUnicode_IS_READY(unicode));
720 kind = PyUnicode_KIND(unicode);
721 data = PyUnicode_DATA(unicode);
722
723 size = 0;
724 /* determine replacement size */
725 for (i = collstart; i < collend; ++i) {
726 Py_ssize_t incr;
727
728 ch = PyUnicode_READ(kind, data, i);
729 if (ch < 0x100)
730 incr = 2+2;
731 else if (ch < 0x10000)
732 incr = 2+4;
733 else {
734 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200735 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200736 }
737 if (size > PY_SSIZE_T_MAX - incr) {
738 PyErr_SetString(PyExc_OverflowError,
739 "encoded result is too long for a Python string");
740 return NULL;
741 }
742 size += incr;
743 }
744
Victor Stinnerad771582015-10-09 12:38:53 +0200745 str = _PyBytesWriter_Prepare(writer, str, size);
746 if (str == NULL)
747 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200748
749 /* generate replacement */
750 for (i = collstart; i < collend; ++i) {
751 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200752 *str++ = '\\';
753 if (ch >= 0x00010000) {
754 *str++ = 'U';
755 *str++ = Py_hexdigits[(ch>>28)&0xf];
756 *str++ = Py_hexdigits[(ch>>24)&0xf];
757 *str++ = Py_hexdigits[(ch>>20)&0xf];
758 *str++ = Py_hexdigits[(ch>>16)&0xf];
759 *str++ = Py_hexdigits[(ch>>12)&0xf];
760 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200761 }
Victor Stinner797485e2015-10-09 03:17:30 +0200762 else if (ch >= 0x100) {
763 *str++ = 'u';
764 *str++ = Py_hexdigits[(ch>>12)&0xf];
765 *str++ = Py_hexdigits[(ch>>8)&0xf];
766 }
767 else
768 *str++ = 'x';
769 *str++ = Py_hexdigits[(ch>>4)&0xf];
770 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200771 }
772 return str;
773}
774
775/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
776 ASCII, Latin1, UTF-8, etc. */
777static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200778xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200779 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
780{
Victor Stinnerad771582015-10-09 12:38:53 +0200781 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200782 Py_UCS4 ch;
783 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300784 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200785
786 assert(PyUnicode_IS_READY(unicode));
787 kind = PyUnicode_KIND(unicode);
788 data = PyUnicode_DATA(unicode);
789
790 size = 0;
791 /* determine replacement size */
792 for (i = collstart; i < collend; ++i) {
793 Py_ssize_t incr;
794
795 ch = PyUnicode_READ(kind, data, i);
796 if (ch < 10)
797 incr = 2+1+1;
798 else if (ch < 100)
799 incr = 2+2+1;
800 else if (ch < 1000)
801 incr = 2+3+1;
802 else if (ch < 10000)
803 incr = 2+4+1;
804 else if (ch < 100000)
805 incr = 2+5+1;
806 else if (ch < 1000000)
807 incr = 2+6+1;
808 else {
809 assert(ch <= MAX_UNICODE);
810 incr = 2+7+1;
811 }
812 if (size > PY_SSIZE_T_MAX - incr) {
813 PyErr_SetString(PyExc_OverflowError,
814 "encoded result is too long for a Python string");
815 return NULL;
816 }
817 size += incr;
818 }
819
Victor Stinnerad771582015-10-09 12:38:53 +0200820 str = _PyBytesWriter_Prepare(writer, str, size);
821 if (str == NULL)
822 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200823
824 /* generate replacement */
825 for (i = collstart; i < collend; ++i) {
Christian Heimes07f2ade2020-11-18 16:38:53 +0100826 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
827 if (size < 0) {
828 return NULL;
829 }
830 str += size;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200831 }
832 return str;
833}
834
Thomas Wouters477c8d52006-05-27 19:21:47 +0000835/* --- Bloom Filters ----------------------------------------------------- */
836
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits (ch & (BLOOM_WIDTH - 1)) of each Unicode character as the bit
   index. */
840
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200841/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000842
Antoine Pitrouf068f942010-01-13 14:19:12 +0000843#if LONG_BIT >= 128
844#define BLOOM_WIDTH 128
845#elif LONG_BIT >= 64
846#define BLOOM_WIDTH 64
847#elif LONG_BIT >= 32
848#define BLOOM_WIDTH 32
849#else
850#error "LONG_BIT is smaller than 32"
851#endif
852
Thomas Wouters477c8d52006-05-27 19:21:47 +0000853#define BLOOM_MASK unsigned long
854
Serhiy Storchaka05997252013-01-26 12:14:02 +0200855static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000856
Antoine Pitrouf068f942010-01-13 14:19:12 +0000857#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000858
Benjamin Peterson29060642009-01-31 22:14:21 +0000859#define BLOOM_LINEBREAK(ch) \
860 ((ch) < 128U ? ascii_linebreak[(ch)] : \
861 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000862
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700863static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300864make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000865{
Victor Stinnera85af502013-04-09 21:53:54 +0200866#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
867 do { \
868 TYPE *data = (TYPE *)PTR; \
869 TYPE *end = data + LEN; \
870 Py_UCS4 ch; \
871 for (; data != end; data++) { \
872 ch = *data; \
873 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
874 } \
875 break; \
876 } while (0)
877
Thomas Wouters477c8d52006-05-27 19:21:47 +0000878 /* calculate simple bloom-style bitmask for a given unicode string */
879
Antoine Pitrouf068f942010-01-13 14:19:12 +0000880 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000881
882 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200883 switch (kind) {
884 case PyUnicode_1BYTE_KIND:
885 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
886 break;
887 case PyUnicode_2BYTE_KIND:
888 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
889 break;
890 case PyUnicode_4BYTE_KIND:
891 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
892 break;
893 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700894 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200895 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000896 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200897
898#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000899}
900
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300901static int
902ensure_unicode(PyObject *obj)
903{
904 if (!PyUnicode_Check(obj)) {
905 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200906 "must be str, not %.100s",
907 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300908 return -1;
909 }
910 return PyUnicode_READY(obj);
911}
912
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200913/* Compilation of templated routines */
914
Victor Stinner90ed8a62020-06-24 00:34:07 +0200915#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200916
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200917#include "stringlib/asciilib.h"
918#include "stringlib/fastsearch.h"
919#include "stringlib/partition.h"
920#include "stringlib/split.h"
921#include "stringlib/count.h"
922#include "stringlib/find.h"
923#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200924#include "stringlib/undef.h"
925
926#include "stringlib/ucs1lib.h"
927#include "stringlib/fastsearch.h"
928#include "stringlib/partition.h"
929#include "stringlib/split.h"
930#include "stringlib/count.h"
931#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300932#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200933#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200934#include "stringlib/undef.h"
935
936#include "stringlib/ucs2lib.h"
937#include "stringlib/fastsearch.h"
938#include "stringlib/partition.h"
939#include "stringlib/split.h"
940#include "stringlib/count.h"
941#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300942#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200943#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200944#include "stringlib/undef.h"
945
946#include "stringlib/ucs4lib.h"
947#include "stringlib/fastsearch.h"
948#include "stringlib/partition.h"
949#include "stringlib/split.h"
950#include "stringlib/count.h"
951#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300952#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200953#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200954#include "stringlib/undef.h"
955
Inada Naoki2c4928d2020-06-17 20:09:44 +0900956_Py_COMP_DIAG_PUSH
957_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200958#include "stringlib/unicodedefs.h"
959#include "stringlib/fastsearch.h"
960#include "stringlib/count.h"
961#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100962#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900963_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200964
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200965#undef STRINGLIB_GET_EMPTY
966
Guido van Rossumd57fd912000-03-10 22:53:23 +0000967/* --- Unicode Object ----------------------------------------------------- */
968
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700969static inline Py_ssize_t
970findchar(const void *s, int kind,
971 Py_ssize_t size, Py_UCS4 ch,
972 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200973{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200974 switch (kind) {
975 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200976 if ((Py_UCS1) ch != ch)
977 return -1;
978 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600979 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200980 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600981 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200982 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200983 if ((Py_UCS2) ch != ch)
984 return -1;
985 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600986 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200987 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600988 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200989 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200990 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600991 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200992 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600993 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200994 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700995 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200996 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200997}
998
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters from index `old_length` up to the current length are
   overwritten, each byte being set to 0xff.  Nothing is done when the
   string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1017
Victor Stinnerfe226c02011-10-03 03:52:20 +02001018static PyObject*
1019resize_compact(PyObject *unicode, Py_ssize_t length)
1020{
1021 Py_ssize_t char_size;
1022 Py_ssize_t struct_size;
1023 Py_ssize_t new_size;
1024 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001025 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001026#ifdef Py_DEBUG
1027 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1028#endif
1029
Victor Stinner79891572012-05-03 13:43:07 +02001030 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001031 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001032 assert(PyUnicode_IS_COMPACT(unicode));
1033
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001034 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001035 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001036 struct_size = sizeof(PyASCIIObject);
1037 else
1038 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001039 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040
Victor Stinnerfe226c02011-10-03 03:52:20 +02001041 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1042 PyErr_NoMemory();
1043 return NULL;
1044 }
1045 new_size = (struct_size + (length + 1) * char_size);
1046
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001047 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001048 PyObject_Free(_PyUnicode_UTF8(unicode));
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001049 _PyUnicode_UTF8(unicode) = NULL;
1050 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1051 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001052#ifdef Py_REF_DEBUG
1053 _Py_RefTotal--;
1054#endif
1055#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001056 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001057#endif
Victor Stinner84def372011-12-11 20:04:56 +01001058
Victor Stinner32bd68c2020-12-01 10:37:39 +01001059 new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001060 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001061 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062 PyErr_NoMemory();
1063 return NULL;
1064 }
Victor Stinner84def372011-12-11 20:04:56 +01001065 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001066 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001067
Victor Stinnerfe226c02011-10-03 03:52:20 +02001068 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001069 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001070 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001071 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001072 _PyUnicode_WSTR_LENGTH(unicode) = length;
1073 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001074 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001075 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001076 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001077 if (!PyUnicode_IS_ASCII(unicode))
1078 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001079 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001080#ifdef Py_DEBUG
1081 unicode_fill_invalid(unicode, old_length);
1082#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001083 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1084 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001085 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001086 return unicode;
1087}
1088
Alexander Belopolsky40018472011-02-26 01:02:56 +00001089static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001090resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091{
Victor Stinner95663112011-10-04 01:03:50 +02001092 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001093 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001094 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001096
Victor Stinnerfe226c02011-10-03 03:52:20 +02001097 if (PyUnicode_IS_READY(unicode)) {
1098 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001099 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001100 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001101#ifdef Py_DEBUG
1102 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1103#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001104
1105 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001106 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001107 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1108 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109
1110 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1111 PyErr_NoMemory();
1112 return -1;
1113 }
1114 new_size = (length + 1) * char_size;
1115
Victor Stinner7a9105a2011-12-12 00:13:42 +01001116 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1117 {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001118 PyObject_Free(_PyUnicode_UTF8(unicode));
Victor Stinner7a9105a2011-12-12 00:13:42 +01001119 _PyUnicode_UTF8(unicode) = NULL;
1120 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1121 }
1122
Victor Stinner32bd68c2020-12-01 10:37:39 +01001123 data = (PyObject *)PyObject_Realloc(data, new_size);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001124 if (data == NULL) {
1125 PyErr_NoMemory();
1126 return -1;
1127 }
1128 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001129 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001130 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001131 _PyUnicode_WSTR_LENGTH(unicode) = length;
1132 }
1133 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001134 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001135 _PyUnicode_UTF8_LENGTH(unicode) = length;
1136 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001137 _PyUnicode_LENGTH(unicode) = length;
1138 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001139#ifdef Py_DEBUG
1140 unicode_fill_invalid(unicode, old_length);
1141#endif
Victor Stinner95663112011-10-04 01:03:50 +02001142 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001143 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001144 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001145 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001146 }
Victor Stinner95663112011-10-04 01:03:50 +02001147 assert(_PyUnicode_WSTR(unicode) != NULL);
1148
1149 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001150 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001151 PyErr_NoMemory();
1152 return -1;
1153 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001154 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001155 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001156 wstr = PyObject_Realloc(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001157 if (!wstr) {
1158 PyErr_NoMemory();
1159 return -1;
1160 }
1161 _PyUnicode_WSTR(unicode) = wstr;
1162 _PyUnicode_WSTR(unicode)[length] = 0;
1163 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001164 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 return 0;
1166}
1167
Victor Stinnerfe226c02011-10-03 03:52:20 +02001168static PyObject*
1169resize_copy(PyObject *unicode, Py_ssize_t length)
1170{
1171 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001172 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001173 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001174
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001175 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001176
1177 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1178 if (copy == NULL)
1179 return NULL;
1180
1181 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001182 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001183 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001184 }
1185 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001186 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001187
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001188 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001189 if (w == NULL)
1190 return NULL;
1191 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1192 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001193 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001194 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001195 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001196 }
1197}
1198
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
1207
Alexander Belopolsky40018472011-02-26 01:02:56 +00001208static PyUnicodeObject *
1209_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001211 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213
Thomas Wouters477c8d52006-05-27 19:21:47 +00001214 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001215 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001216 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 }
1218
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001219 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001220 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001221 return (PyUnicodeObject *)PyErr_NoMemory();
1222 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001223 if (length < 0) {
1224 PyErr_SetString(PyExc_SystemError,
1225 "Negative size passed to _PyUnicode_New");
1226 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227 }
1228
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001229 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1230 if (unicode == NULL)
1231 return NULL;
1232 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001233
1234 _PyUnicode_WSTR_LENGTH(unicode) = length;
1235 _PyUnicode_HASH(unicode) = -1;
1236 _PyUnicode_STATE(unicode).interned = 0;
1237 _PyUnicode_STATE(unicode).kind = 0;
1238 _PyUnicode_STATE(unicode).compact = 0;
1239 _PyUnicode_STATE(unicode).ready = 0;
1240 _PyUnicode_STATE(unicode).ascii = 0;
1241 _PyUnicode_DATA_ANY(unicode) = NULL;
1242 _PyUnicode_LENGTH(unicode) = 0;
1243 _PyUnicode_UTF8(unicode) = NULL;
1244 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1245
Victor Stinner32bd68c2020-12-01 10:37:39 +01001246 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001248 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001249 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001250 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001251 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252
Jeremy Hyltond8082792003-09-16 19:41:39 +00001253 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001254 * the caller fails before initializing str -- unicode_resize()
1255 * reads str[0], and the Keep-Alive optimization can keep memory
1256 * allocated for str alive across a call to unicode_dealloc(unicode).
1257 * We don't want unicode_resize to read uninitialized memory in
1258 * that case.
1259 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001260 _PyUnicode_WSTR(unicode)[0] = 0;
1261 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001262
Victor Stinner7931d9a2011-11-04 00:22:48 +01001263 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 return unicode;
1265}
1266
Victor Stinnerf42dc442011-10-02 23:33:16 +02001267static const char*
1268unicode_kind_name(PyObject *unicode)
1269{
Victor Stinner42dfd712011-10-03 14:41:45 +02001270 /* don't check consistency: unicode_kind_name() is called from
1271 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001272 if (!PyUnicode_IS_COMPACT(unicode))
1273 {
1274 if (!PyUnicode_IS_READY(unicode))
1275 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001276 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001277 {
1278 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001279 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001280 return "legacy ascii";
1281 else
1282 return "legacy latin1";
1283 case PyUnicode_2BYTE_KIND:
1284 return "legacy UCS2";
1285 case PyUnicode_4BYTE_KIND:
1286 return "legacy UCS4";
1287 default:
1288 return "<legacy invalid kind>";
1289 }
1290 }
1291 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001292 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001293 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001294 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001295 return "ascii";
1296 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001297 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001298 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001299 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001300 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001301 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001302 default:
1303 return "<invalid compact kind>";
1304 }
1305}
1306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001309const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001310 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001311 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312}
1313
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001314const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001315 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316 return _PyUnicode_COMPACT_DATA(unicode);
1317}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001318const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001319 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001320 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001321 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1322 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1323 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1324 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1325 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1326 return PyUnicode_DATA(unicode);
1327}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001328
1329void
1330_PyUnicode_Dump(PyObject *op)
1331{
1332 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001333 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1334 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001335 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001336
Victor Stinnera849a4b2011-10-03 12:12:11 +02001337 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001338 {
1339 if (ascii->state.ascii)
1340 data = (ascii + 1);
1341 else
1342 data = (compact + 1);
1343 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001344 else
1345 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001346 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001347
Victor Stinnera849a4b2011-10-03 12:12:11 +02001348 if (ascii->wstr == data)
1349 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001350 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001351
Victor Stinnera3b334d2011-10-03 13:53:37 +02001352 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001353 printf(" (%zu), ", compact->wstr_length);
1354 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001355 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001356 }
1357 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001358 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001359 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001360}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361#endif
1362
Victor Stinner91698d82020-06-25 14:07:40 +02001363static int
1364unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1365{
1366 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1367 // optimized to always use state->empty_string without having to check if
1368 // it is NULL or not.
1369 PyObject *empty = PyUnicode_New(1, 0);
1370 if (empty == NULL) {
1371 return -1;
1372 }
1373 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1374 _PyUnicode_LENGTH(empty) = 0;
1375 assert(_PyUnicode_CheckConsistency(empty, 1));
1376
1377 assert(state->empty_string == NULL);
1378 state->empty_string = empty;
1379 return 0;
1380}
1381
1382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383PyObject *
1384PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1385{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001386 /* Optimization for empty strings */
1387 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001388 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001389 }
1390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001391 PyObject *obj;
1392 PyCompactUnicodeObject *unicode;
1393 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001394 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001395 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 Py_ssize_t char_size;
1397 Py_ssize_t struct_size;
1398
Victor Stinner9e9d6892011-10-04 01:02:02 +02001399 is_ascii = 0;
1400 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 struct_size = sizeof(PyCompactUnicodeObject);
1402 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001403 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001404 char_size = 1;
1405 is_ascii = 1;
1406 struct_size = sizeof(PyASCIIObject);
1407 }
1408 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001409 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001410 char_size = 1;
1411 }
1412 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001413 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 char_size = 2;
1415 if (sizeof(wchar_t) == 2)
1416 is_sharing = 1;
1417 }
1418 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001419 if (maxchar > MAX_UNICODE) {
1420 PyErr_SetString(PyExc_SystemError,
1421 "invalid maximum character passed to PyUnicode_New");
1422 return NULL;
1423 }
Victor Stinner8f825062012-04-27 13:55:39 +02001424 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001425 char_size = 4;
1426 if (sizeof(wchar_t) == 4)
1427 is_sharing = 1;
1428 }
1429
1430 /* Ensure we won't overflow the size. */
1431 if (size < 0) {
1432 PyErr_SetString(PyExc_SystemError,
1433 "Negative size passed to PyUnicode_New");
1434 return NULL;
1435 }
1436 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1437 return PyErr_NoMemory();
1438
1439 /* Duplicated allocation code from _PyObject_New() instead of a call to
1440 * PyObject_New() so we are able to allocate space for the object and
1441 * it's data buffer.
1442 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001443 obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001444 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001446 }
1447 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448
1449 unicode = (PyCompactUnicodeObject *)obj;
1450 if (is_ascii)
1451 data = ((PyASCIIObject*)obj) + 1;
1452 else
1453 data = unicode + 1;
1454 _PyUnicode_LENGTH(unicode) = size;
1455 _PyUnicode_HASH(unicode) = -1;
1456 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001457 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001458 _PyUnicode_STATE(unicode).compact = 1;
1459 _PyUnicode_STATE(unicode).ready = 1;
1460 _PyUnicode_STATE(unicode).ascii = is_ascii;
1461 if (is_ascii) {
1462 ((char*)data)[size] = 0;
1463 _PyUnicode_WSTR(unicode) = NULL;
1464 }
Victor Stinner8f825062012-04-27 13:55:39 +02001465 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001466 ((char*)data)[size] = 0;
1467 _PyUnicode_WSTR(unicode) = NULL;
1468 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001469 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001470 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001471 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001472 else {
1473 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001474 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001475 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001477 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478 ((Py_UCS4*)data)[size] = 0;
1479 if (is_sharing) {
1480 _PyUnicode_WSTR_LENGTH(unicode) = size;
1481 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1482 }
1483 else {
1484 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1485 _PyUnicode_WSTR(unicode) = NULL;
1486 }
1487 }
Victor Stinner8f825062012-04-27 13:55:39 +02001488#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001489 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001490#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001491 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 return obj;
1493}
1494
1495#if SIZEOF_WCHAR_T == 2
1496/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1497 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001498 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499
1500 This function assumes that unicode can hold one more code point than wstr
1501 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001502static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001503unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001504 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001505{
1506 const wchar_t *iter;
1507 Py_UCS4 *ucs4_out;
1508
Victor Stinner910337b2011-10-03 03:20:16 +02001509 assert(unicode != NULL);
1510 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1512 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1513
1514 for (iter = begin; iter < end; ) {
1515 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1516 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001517 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1518 && (iter+1) < end
1519 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001520 {
Victor Stinner551ac952011-11-29 22:58:13 +01001521 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001522 iter += 2;
1523 }
1524 else {
1525 *ucs4_out++ = *iter;
1526 iter++;
1527 }
1528 }
1529 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1530 _PyUnicode_GET_LENGTH(unicode)));
1531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532}
1533#endif
1534
Victor Stinnercd9950f2011-10-02 00:34:53 +02001535static int
Victor Stinner488fa492011-12-12 00:01:39 +01001536unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001537{
Victor Stinner488fa492011-12-12 00:01:39 +01001538 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001539 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001540 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001541 return -1;
1542 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001543 return 0;
1544}
1545
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001546static int
1547_copy_characters(PyObject *to, Py_ssize_t to_start,
1548 PyObject *from, Py_ssize_t from_start,
1549 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001550{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001551 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001552 const void *from_data;
1553 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554
Victor Stinneree4544c2012-05-09 22:24:08 +02001555 assert(0 <= how_many);
1556 assert(0 <= from_start);
1557 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001558 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001559 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001560 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001561
Victor Stinnerd3f08822012-05-29 12:57:52 +02001562 assert(PyUnicode_Check(to));
1563 assert(PyUnicode_IS_READY(to));
1564 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1565
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001566 if (how_many == 0)
1567 return 0;
1568
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001569 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001570 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001571 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001572 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573
Victor Stinnerf1852262012-06-16 16:38:26 +02001574#ifdef Py_DEBUG
1575 if (!check_maxchar
1576 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1577 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001578 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001579 Py_UCS4 ch;
1580 Py_ssize_t i;
1581 for (i=0; i < how_many; i++) {
1582 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1583 assert(ch <= to_maxchar);
1584 }
1585 }
1586#endif
1587
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001588 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001589 if (check_maxchar
1590 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1591 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001592 /* Writing Latin-1 characters into an ASCII string requires to
1593 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001594 Py_UCS4 max_char;
1595 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001596 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001597 if (max_char >= 128)
1598 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001599 }
Christian Heimesf051e432016-09-13 20:22:02 +02001600 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001601 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001602 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001603 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001604 else if (from_kind == PyUnicode_1BYTE_KIND
1605 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001606 {
1607 _PyUnicode_CONVERT_BYTES(
1608 Py_UCS1, Py_UCS2,
1609 PyUnicode_1BYTE_DATA(from) + from_start,
1610 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1611 PyUnicode_2BYTE_DATA(to) + to_start
1612 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001613 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001614 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001615 && to_kind == PyUnicode_4BYTE_KIND)
1616 {
1617 _PyUnicode_CONVERT_BYTES(
1618 Py_UCS1, Py_UCS4,
1619 PyUnicode_1BYTE_DATA(from) + from_start,
1620 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1621 PyUnicode_4BYTE_DATA(to) + to_start
1622 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001623 }
1624 else if (from_kind == PyUnicode_2BYTE_KIND
1625 && to_kind == PyUnicode_4BYTE_KIND)
1626 {
1627 _PyUnicode_CONVERT_BYTES(
1628 Py_UCS2, Py_UCS4,
1629 PyUnicode_2BYTE_DATA(from) + from_start,
1630 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1631 PyUnicode_4BYTE_DATA(to) + to_start
1632 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001633 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001634 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001635 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1636
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001637 if (!check_maxchar) {
1638 if (from_kind == PyUnicode_2BYTE_KIND
1639 && to_kind == PyUnicode_1BYTE_KIND)
1640 {
1641 _PyUnicode_CONVERT_BYTES(
1642 Py_UCS2, Py_UCS1,
1643 PyUnicode_2BYTE_DATA(from) + from_start,
1644 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1645 PyUnicode_1BYTE_DATA(to) + to_start
1646 );
1647 }
1648 else if (from_kind == PyUnicode_4BYTE_KIND
1649 && to_kind == PyUnicode_1BYTE_KIND)
1650 {
1651 _PyUnicode_CONVERT_BYTES(
1652 Py_UCS4, Py_UCS1,
1653 PyUnicode_4BYTE_DATA(from) + from_start,
1654 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1655 PyUnicode_1BYTE_DATA(to) + to_start
1656 );
1657 }
1658 else if (from_kind == PyUnicode_4BYTE_KIND
1659 && to_kind == PyUnicode_2BYTE_KIND)
1660 {
1661 _PyUnicode_CONVERT_BYTES(
1662 Py_UCS4, Py_UCS2,
1663 PyUnicode_4BYTE_DATA(from) + from_start,
1664 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1665 PyUnicode_2BYTE_DATA(to) + to_start
1666 );
1667 }
1668 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001669 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001670 }
1671 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001672 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001673 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001674 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001675 Py_ssize_t i;
1676
Victor Stinnera0702ab2011-09-29 14:14:38 +02001677 for (i=0; i < how_many; i++) {
1678 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001679 if (ch > to_maxchar)
1680 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001681 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1682 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001683 }
1684 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001685 return 0;
1686}
1687
Victor Stinnerd3f08822012-05-29 12:57:52 +02001688void
1689_PyUnicode_FastCopyCharacters(
1690 PyObject *to, Py_ssize_t to_start,
1691 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001692{
1693 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1694}
1695
1696Py_ssize_t
1697PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1698 PyObject *from, Py_ssize_t from_start,
1699 Py_ssize_t how_many)
1700{
1701 int err;
1702
1703 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1704 PyErr_BadInternalCall();
1705 return -1;
1706 }
1707
Benjamin Petersonbac79492012-01-14 13:34:47 -05001708 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001709 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001710 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001711 return -1;
1712
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001713 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001714 PyErr_SetString(PyExc_IndexError, "string index out of range");
1715 return -1;
1716 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001717 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001718 PyErr_SetString(PyExc_IndexError, "string index out of range");
1719 return -1;
1720 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001721 if (how_many < 0) {
1722 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1723 return -1;
1724 }
1725 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001726 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1727 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001728 "Cannot write %zi characters at %zi "
1729 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001730 how_many, to_start, PyUnicode_GET_LENGTH(to));
1731 return -1;
1732 }
1733
1734 if (how_many == 0)
1735 return 0;
1736
Victor Stinner488fa492011-12-12 00:01:39 +01001737 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001738 return -1;
1739
1740 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1741 if (err) {
1742 PyErr_Format(PyExc_SystemError,
1743 "Cannot copy %s characters "
1744 "into a string of %s characters",
1745 unicode_kind_name(from),
1746 unicode_kind_name(to));
1747 return -1;
1748 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001749 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750}
1751
Victor Stinner17222162011-09-28 22:15:37 +02001752/* Find the maximum code point and count the number of surrogate pairs so a
1753 correct string length can be computed before converting a string to UCS4.
1754 This function counts single surrogates as a character and not as a pair.
1755
1756 Return 0 on success, or -1 on error. */
1757static int
1758find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1759 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760{
1761 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001762 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001763
Victor Stinnerc53be962011-10-02 21:33:54 +02001764 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001765 *num_surrogates = 0;
1766 *maxchar = 0;
1767
1768 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001770 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1771 && (iter+1) < end
1772 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1773 {
1774 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1775 ++(*num_surrogates);
1776 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777 }
1778 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001779#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001780 {
1781 ch = *iter;
1782 iter++;
1783 }
1784 if (ch > *maxchar) {
1785 *maxchar = ch;
1786 if (*maxchar > MAX_UNICODE) {
1787 PyErr_Format(PyExc_ValueError,
Victor Stinner99768342021-03-17 21:46:53 +01001788 "character U+%x is not in range [U+0000; U+%x]",
1789 ch, MAX_UNICODE);
Victor Stinner8faf8212011-12-08 22:14:11 +01001790 return -1;
1791 }
1792 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 }
1794 return 0;
1795}
1796
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001797int
1798_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001799{
1800 wchar_t *end;
1801 Py_UCS4 maxchar = 0;
1802 Py_ssize_t num_surrogates;
1803#if SIZEOF_WCHAR_T == 2
1804 Py_ssize_t length_wo_surrogates;
1805#endif
1806
Georg Brandl7597add2011-10-05 16:36:47 +02001807 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001808 strings were created using _PyObject_New() and where no canonical
1809 representation (the str field) has been set yet aka strings
1810 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001811 assert(_PyUnicode_CHECK(unicode));
1812 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001814 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001815 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001816 /* Actually, it should neither be interned nor be anything else: */
1817 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001820 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001821 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823
1824 if (maxchar < 256) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001825 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001826 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827 PyErr_NoMemory();
1828 return -1;
1829 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001830 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 _PyUnicode_WSTR(unicode), end,
1832 PyUnicode_1BYTE_DATA(unicode));
1833 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1834 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1835 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1836 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001837 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001838 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001839 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001840 }
1841 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001842 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001843 _PyUnicode_UTF8(unicode) = NULL;
1844 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001846 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001847 _PyUnicode_WSTR(unicode) = NULL;
1848 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1849 }
1850 /* In this case we might have to convert down from 4-byte native
1851 wchar_t to 2-byte unicode. */
1852 else if (maxchar < 65536) {
1853 assert(num_surrogates == 0 &&
1854 "FindMaxCharAndNumSurrogatePairs() messed up");
1855
Victor Stinner506f5922011-09-28 22:34:18 +02001856#if SIZEOF_WCHAR_T == 2
1857 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001858 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001859 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1860 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1861 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001862 _PyUnicode_UTF8(unicode) = NULL;
1863 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001864#else
1865 /* sizeof(wchar_t) == 4 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001866 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
Victor Stinner506f5922011-09-28 22:34:18 +02001867 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001868 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001869 PyErr_NoMemory();
1870 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871 }
Victor Stinner506f5922011-09-28 22:34:18 +02001872 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1873 _PyUnicode_WSTR(unicode), end,
1874 PyUnicode_2BYTE_DATA(unicode));
1875 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1876 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1877 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001878 _PyUnicode_UTF8(unicode) = NULL;
1879 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner32bd68c2020-12-01 10:37:39 +01001880 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinner506f5922011-09-28 22:34:18 +02001881 _PyUnicode_WSTR(unicode) = NULL;
1882 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1883#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001884 }
Ikko Ashimine38811d62020-11-10 14:57:34 +09001885 /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001886 else {
1887#if SIZEOF_WCHAR_T == 2
1888 /* in case the native representation is 2-bytes, we need to allocate a
1889 new normalized 4-byte version. */
1890 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001891 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1892 PyErr_NoMemory();
1893 return -1;
1894 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001895 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001896 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001897 PyErr_NoMemory();
1898 return -1;
1899 }
1900 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1901 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001902 _PyUnicode_UTF8(unicode) = NULL;
1903 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001904 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1905 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001906 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001907 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001908 _PyUnicode_WSTR(unicode) = NULL;
1909 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1910#else
1911 assert(num_surrogates == 0);
1912
Victor Stinnerc3c74152011-10-02 20:39:55 +02001913 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001914 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001915 _PyUnicode_UTF8(unicode) = NULL;
1916 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001917 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1918#endif
1919 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1920 }
1921 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001922 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001923 return 0;
1924}
1925
Alexander Belopolsky40018472011-02-26 01:02:56 +00001926static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001927unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928{
Walter Dörwald16807132007-05-25 13:52:07 +00001929 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001930 case SSTATE_NOT_INTERNED:
1931 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001932
Benjamin Peterson29060642009-01-31 22:14:21 +00001933 case SSTATE_INTERNED_MORTAL:
Victor Stinnerea251802020-12-26 02:58:33 +01001934 {
1935 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner3549ca32020-07-03 16:59:12 +02001936 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1937 references (key and value) which were ignored by
1938 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1939 to prevent calling unicode_dealloc() again. Adjust refcnt after
1940 PyDict_DelItem(). */
1941 assert(Py_REFCNT(unicode) == 0);
1942 Py_SET_REFCNT(unicode, 3);
Victor Stinnerea251802020-12-26 02:58:33 +01001943 if (PyDict_DelItem(state->interned, unicode) != 0) {
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001944 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1945 NULL);
1946 }
Victor Stinner3549ca32020-07-03 16:59:12 +02001947 assert(Py_REFCNT(unicode) == 1);
1948 Py_SET_REFCNT(unicode, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001949 break;
Victor Stinnerea251802020-12-26 02:58:33 +01001950 }
Walter Dörwald16807132007-05-25 13:52:07 +00001951
Benjamin Peterson29060642009-01-31 22:14:21 +00001952 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001953 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1954 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001955
Benjamin Peterson29060642009-01-31 22:14:21 +00001956 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001957 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001958 }
1959
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001960 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001961 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001962 }
1963 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001964 PyObject_Free(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001965 }
1966 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001967 PyObject_Free(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001969
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001970 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971}
1972
#ifdef Py_DEBUG
/* Debug-only helper: return 1 if `unicode` is one of the shared objects
   held in the state returned by get_unicode_state() -- either its
   empty_string or the latin1[] entry for the string's single character --
   and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *cache = get_unicode_state();

    if (cache->empty_string == unicode) {
        return 1;
    }

    PyASCIIObject *header = (PyASCIIObject *)unicode;
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        /* Only one-character strings that are not of WCHAR kind are
           looked up in latin1[]. */
        return 0;
    }

    Py_UCS4 only_char = PyUnicode_READ_CHAR(unicode, 0);
    if (only_char >= 256) {
        return 0;
    }
    return (cache->latin1[only_char] == unicode) ? 1 : 0;
}
#endif
1992
Alexander Belopolsky40018472011-02-26 01:02:56 +00001993static int
Victor Stinner488fa492011-12-12 00:01:39 +01001994unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001995{
Victor Stinner488fa492011-12-12 00:01:39 +01001996 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001997 if (Py_REFCNT(unicode) != 1)
1998 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001999 if (_PyUnicode_HASH(unicode) != -1)
2000 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002001 if (PyUnicode_CHECK_INTERNED(unicode))
2002 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002003 if (!PyUnicode_CheckExact(unicode))
2004 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002005#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002006 /* singleton refcount is greater than 1 */
2007 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002008#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002009 return 1;
2010}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002011
Victor Stinnerfe226c02011-10-03 03:52:20 +02002012static int
2013unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2014{
2015 PyObject *unicode;
2016 Py_ssize_t old_length;
2017
2018 assert(p_unicode != NULL);
2019 unicode = *p_unicode;
2020
2021 assert(unicode != NULL);
2022 assert(PyUnicode_Check(unicode));
2023 assert(0 <= length);
2024
Victor Stinner910337b2011-10-03 03:20:16 +02002025 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002026 old_length = PyUnicode_WSTR_LENGTH(unicode);
2027 else
2028 old_length = PyUnicode_GET_LENGTH(unicode);
2029 if (old_length == length)
2030 return 0;
2031
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002032 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002033 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002034 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002035 return 0;
2036 }
2037
Victor Stinner488fa492011-12-12 00:01:39 +01002038 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002039 PyObject *copy = resize_copy(unicode, length);
2040 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002041 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002042 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002043 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002044 }
2045
Victor Stinnerfe226c02011-10-03 03:52:20 +02002046 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002047 PyObject *new_unicode = resize_compact(unicode, length);
2048 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002049 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002050 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002051 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002052 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002053 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002054}
2055
Alexander Belopolsky40018472011-02-26 01:02:56 +00002056int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002057PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002058{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002059 PyObject *unicode;
2060 if (p_unicode == NULL) {
2061 PyErr_BadInternalCall();
2062 return -1;
2063 }
2064 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002065 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002066 {
2067 PyErr_BadInternalCall();
2068 return -1;
2069 }
2070 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002071}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002072
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002073/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002074
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002075 WARNING: The function doesn't copy the terminating null character and
2076 doesn't check the maximum character (may write a latin1 character in an
2077 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002078static void
2079unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2080 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002081{
2082 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002083 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002084 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002085
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002086 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002087 switch (kind) {
2088 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002089#ifdef Py_DEBUG
2090 if (PyUnicode_IS_ASCII(unicode)) {
2091 Py_UCS4 maxchar = ucs1lib_find_max_char(
2092 (const Py_UCS1*)str,
2093 (const Py_UCS1*)str + len);
2094 assert(maxchar < 128);
2095 }
2096#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002097 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002098 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002099 }
2100 case PyUnicode_2BYTE_KIND: {
2101 Py_UCS2 *start = (Py_UCS2 *)data + index;
2102 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002103
Victor Stinner184252a2012-06-16 02:57:41 +02002104 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002105 *ucs2 = (Py_UCS2)*str;
2106
2107 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002108 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002109 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002110 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002111 Py_UCS4 *start = (Py_UCS4 *)data + index;
2112 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002113
Victor Stinner184252a2012-06-16 02:57:41 +02002114 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002115 *ucs4 = (Py_UCS4)*str;
2116
2117 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002118 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002119 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002120 default:
2121 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002122 }
2123}
2124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002126get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002127{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002128 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002129
Victor Stinner2f9ada92020-06-24 02:22:21 +02002130 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002131 if (unicode) {
2132 Py_INCREF(unicode);
2133 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002134 }
Victor Stinner607b1022020-05-05 18:50:30 +02002135
2136 unicode = PyUnicode_New(1, ch);
2137 if (!unicode) {
2138 return NULL;
2139 }
2140
2141 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2142 assert(_PyUnicode_CheckConsistency(unicode, 1));
2143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002145 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002146 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002147}
2148
Victor Stinner985a82a2014-01-03 12:53:47 +01002149static PyObject*
2150unicode_char(Py_UCS4 ch)
2151{
2152 PyObject *unicode;
2153
2154 assert(ch <= MAX_UNICODE);
2155
Victor Stinner2f9ada92020-06-24 02:22:21 +02002156 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002157 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002158 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002159
Victor Stinner985a82a2014-01-03 12:53:47 +01002160 unicode = PyUnicode_New(1, ch);
2161 if (unicode == NULL)
2162 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002163
2164 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2165 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002166 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002167 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002168 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2169 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2170 }
2171 assert(_PyUnicode_CheckConsistency(unicode, 1));
2172 return unicode;
2173}
2174
Alexander Belopolsky40018472011-02-26 01:02:56 +00002175PyObject *
2176PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177{
Inada Naoki038dd0f2020-06-30 15:26:56 +09002178 if (u == NULL) {
2179 if (size > 0) {
2180 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2181 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2182 "use PyUnicode_New() instead", 1) < 0) {
2183 return NULL;
2184 }
2185 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002186 return (PyObject*)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002187 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002188
2189 if (size < 0) {
2190 PyErr_BadInternalCall();
2191 return NULL;
2192 }
2193
2194 return PyUnicode_FromWideChar(u, size);
2195}
2196
2197PyObject *
2198PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2199{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002200 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002201 Py_UCS4 maxchar = 0;
2202 Py_ssize_t num_surrogates;
2203
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002204 if (u == NULL && size != 0) {
2205 PyErr_BadInternalCall();
2206 return NULL;
2207 }
2208
2209 if (size == -1) {
2210 size = wcslen(u);
2211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002213 /* If the Unicode data is known at construction time, we can apply
2214 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002216 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002217 if (size == 0)
2218 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 /* Single character Unicode objects in the Latin-1 range are
2221 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002222 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002223 return get_latin1_char((unsigned char)*u);
2224
2225 /* If not empty and not single character, copy the Unicode data
2226 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002227 if (find_maxchar_surrogates(u, u + size,
2228 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002229 return NULL;
2230
Victor Stinner8faf8212011-12-08 22:14:11 +01002231 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 if (!unicode)
2233 return NULL;
2234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002235 switch (PyUnicode_KIND(unicode)) {
2236 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002237 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2239 break;
2240 case PyUnicode_2BYTE_KIND:
2241#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002242 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002244 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002245 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2246#endif
2247 break;
2248 case PyUnicode_4BYTE_KIND:
2249#if SIZEOF_WCHAR_T == 2
2250 /* This is the only case which has to process surrogates, thus
2251 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002252 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253#else
2254 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002255 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256#endif
2257 break;
2258 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002259 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002261
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002262 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263}
2264
Alexander Belopolsky40018472011-02-26 01:02:56 +00002265PyObject *
2266PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002267{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002268 if (size < 0) {
2269 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002270 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002271 return NULL;
2272 }
Inada Naoki038dd0f2020-06-30 15:26:56 +09002273 if (u != NULL) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002274 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002275 }
2276 else {
2277 if (size > 0) {
2278 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2279 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2280 "use PyUnicode_New() instead", 1) < 0) {
2281 return NULL;
2282 }
2283 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002284 return (PyObject *)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002285 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002286}
2287
Alexander Belopolsky40018472011-02-26 01:02:56 +00002288PyObject *
2289PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002290{
2291 size_t size = strlen(u);
2292 if (size > PY_SSIZE_T_MAX) {
2293 PyErr_SetString(PyExc_OverflowError, "input too long");
2294 return NULL;
2295 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002296 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002297}
2298
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002299
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002300PyObject *
2301_PyUnicode_FromId(_Py_Identifier *id)
2302{
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002303 PyInterpreterState *interp = _PyInterpreterState_GET();
2304 struct _Py_unicode_ids *ids = &interp->unicode.ids;
2305
Pablo Galindoa6d63a22020-12-29 00:28:09 +00002306 Py_ssize_t index = _Py_atomic_size_get(&id->index);
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002307 if (index < 0) {
2308 struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2309
2310 PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2311 // Check again to detect concurrent access. Another thread can have
2312 // initialized the index while this thread waited for the lock.
2313 index = _Py_atomic_size_get(&id->index);
2314 if (index < 0) {
2315 assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2316 index = rt_ids->next_index;
2317 rt_ids->next_index++;
2318 _Py_atomic_size_set(&id->index, index);
2319 }
2320 PyThread_release_lock(rt_ids->lock);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002321 }
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002322 assert(index >= 0);
Victor Stinner297257f2020-06-02 14:39:45 +02002323
2324 PyObject *obj;
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002325 if (index < ids->size) {
2326 obj = ids->array[index];
2327 if (obj) {
2328 // Return a borrowed reference
2329 return obj;
2330 }
2331 }
2332
2333 obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
Victor Stinner297257f2020-06-02 14:39:45 +02002334 NULL, NULL);
2335 if (!obj) {
2336 return NULL;
2337 }
2338 PyUnicode_InternInPlace(&obj);
2339
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002340 if (index >= ids->size) {
2341 // Overallocate to reduce the number of realloc
2342 Py_ssize_t new_size = Py_MAX(index * 2, 16);
2343 Py_ssize_t item_size = sizeof(ids->array[0]);
2344 PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2345 if (new_array == NULL) {
2346 PyErr_NoMemory();
2347 return NULL;
2348 }
2349 memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2350 ids->array = new_array;
2351 ids->size = new_size;
2352 }
2353
2354 // The array stores a strong reference
2355 ids->array[index] = obj;
2356
2357 // Return a borrowed reference
2358 return obj;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002359}
2360
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002361
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002362static void
Victor Stinnerf4507232020-12-26 20:26:08 +01002363unicode_clear_identifiers(struct _Py_unicode_state *state)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002364{
Victor Stinnerf4507232020-12-26 20:26:08 +01002365 struct _Py_unicode_ids *ids = &state->ids;
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002366 for (Py_ssize_t i=0; i < ids->size; i++) {
2367 Py_XDECREF(ids->array[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002368 }
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002369 ids->size = 0;
2370 PyMem_Free(ids->array);
2371 ids->array = NULL;
2372 // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2373 // after Py_Finalize().
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002374}
2375
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002376
Benjamin Peterson0df54292012-03-26 14:50:32 -04002377/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002378
Victor Stinnerd3f08822012-05-29 12:57:52 +02002379PyObject*
2380_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002381{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002382 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002383 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002384 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002385#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002386 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002387#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002388 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002389 }
Victor Stinner785938e2011-12-11 20:09:03 +01002390 unicode = PyUnicode_New(size, 127);
2391 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002392 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002393 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2394 assert(_PyUnicode_CheckConsistency(unicode, 1));
2395 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002396}
2397
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002398static Py_UCS4
2399kind_maxchar_limit(unsigned int kind)
2400{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002401 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002402 case PyUnicode_1BYTE_KIND:
2403 return 0x80;
2404 case PyUnicode_2BYTE_KIND:
2405 return 0x100;
2406 case PyUnicode_4BYTE_KIND:
2407 return 0x10000;
2408 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002409 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002410 }
2411}
2412
Victor Stinner702c7342011-10-05 13:50:52 +02002413static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002414_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002415{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002416 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002417 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002418
Victor Stinner2f9ada92020-06-24 02:22:21 +02002419 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002420 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002421 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002422 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002423 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002424 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002425 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002426
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002427 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002428 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429 if (!res)
2430 return NULL;
2431 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002432 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002434}
2435
Victor Stinnere57b1c02011-09-28 22:20:48 +02002436static PyObject*
2437_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438{
2439 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002440 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002441
Serhiy Storchaka678db842013-01-26 12:16:36 +02002442 if (size == 0)
2443 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002444 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002445 if (size == 1)
2446 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002447
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002448 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002449 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 if (!res)
2451 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002452 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002453 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002454 else {
2455 _PyUnicode_CONVERT_BYTES(
2456 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2457 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002458 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 return res;
2460}
2461
Victor Stinnere57b1c02011-09-28 22:20:48 +02002462static PyObject*
2463_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464{
2465 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002466 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002467
Serhiy Storchaka678db842013-01-26 12:16:36 +02002468 if (size == 0)
2469 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002470 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002471 if (size == 1)
2472 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002473
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002474 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002475 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476 if (!res)
2477 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002478 if (max_char < 256)
2479 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2480 PyUnicode_1BYTE_DATA(res));
2481 else if (max_char < 0x10000)
2482 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2483 PyUnicode_2BYTE_DATA(res));
2484 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002486 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002487 return res;
2488}
2489
2490PyObject*
2491PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2492{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002493 if (size < 0) {
2494 PyErr_SetString(PyExc_ValueError, "size must be positive");
2495 return NULL;
2496 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002497 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002498 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002499 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002501 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002503 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002504 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002505 PyErr_SetString(PyExc_SystemError, "invalid kind");
2506 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508}
2509
Victor Stinnerece58de2012-04-23 23:36:38 +02002510Py_UCS4
2511_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2512{
2513 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002514 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002515
2516 assert(PyUnicode_IS_READY(unicode));
2517 assert(0 <= start);
2518 assert(end <= PyUnicode_GET_LENGTH(unicode));
2519 assert(start <= end);
2520
2521 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2522 return PyUnicode_MAX_CHAR_VALUE(unicode);
2523
2524 if (start == end)
2525 return 127;
2526
Victor Stinner94d558b2012-04-27 22:26:58 +02002527 if (PyUnicode_IS_ASCII(unicode))
2528 return 127;
2529
Victor Stinnerece58de2012-04-23 23:36:38 +02002530 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002531 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002532 endptr = (char *)startptr + end * kind;
2533 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002534 switch(kind) {
2535 case PyUnicode_1BYTE_KIND:
2536 return ucs1lib_find_max_char(startptr, endptr);
2537 case PyUnicode_2BYTE_KIND:
2538 return ucs2lib_find_max_char(startptr, endptr);
2539 case PyUnicode_4BYTE_KIND:
2540 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002541 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002542 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002543 }
2544}
2545
Victor Stinner25a4b292011-10-06 12:31:55 +02002546/* Ensure that a string uses the most efficient storage, if it is not the
2547 case: create a new string with of the right kind. Write NULL into *p_unicode
2548 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002549static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002550unicode_adjust_maxchar(PyObject **p_unicode)
2551{
2552 PyObject *unicode, *copy;
2553 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002554 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002555 unsigned int kind;
2556
2557 assert(p_unicode != NULL);
2558 unicode = *p_unicode;
2559 assert(PyUnicode_IS_READY(unicode));
2560 if (PyUnicode_IS_ASCII(unicode))
2561 return;
2562
2563 len = PyUnicode_GET_LENGTH(unicode);
2564 kind = PyUnicode_KIND(unicode);
2565 if (kind == PyUnicode_1BYTE_KIND) {
2566 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002567 max_char = ucs1lib_find_max_char(u, u + len);
2568 if (max_char >= 128)
2569 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002570 }
2571 else if (kind == PyUnicode_2BYTE_KIND) {
2572 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002573 max_char = ucs2lib_find_max_char(u, u + len);
2574 if (max_char >= 256)
2575 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002576 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002577 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002578 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002579 max_char = ucs4lib_find_max_char(u, u + len);
2580 if (max_char >= 0x10000)
2581 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002582 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002583 else
2584 Py_UNREACHABLE();
2585
Victor Stinner25a4b292011-10-06 12:31:55 +02002586 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002587 if (copy != NULL)
2588 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002589 Py_DECREF(unicode);
2590 *p_unicode = copy;
2591}
2592
Victor Stinner034f6cf2011-09-30 02:26:44 +02002593PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002594_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002595{
Victor Stinner87af4f22011-11-21 23:03:47 +01002596 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002597 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002598
Victor Stinner034f6cf2011-09-30 02:26:44 +02002599 if (!PyUnicode_Check(unicode)) {
2600 PyErr_BadInternalCall();
2601 return NULL;
2602 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002603 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002604 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002605
Victor Stinner87af4f22011-11-21 23:03:47 +01002606 length = PyUnicode_GET_LENGTH(unicode);
2607 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002608 if (!copy)
2609 return NULL;
2610 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2611
Christian Heimesf051e432016-09-13 20:22:02 +02002612 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002613 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002614 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002615 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002616}
2617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618
Victor Stinnerbc603d12011-10-02 01:00:40 +02002619/* Widen Unicode objects to larger buffers. Don't write terminating null
2620 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002622static void*
2623unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002624{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002625 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002626
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002627 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002628 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002629 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002630 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002631 if (!result)
2632 return PyErr_NoMemory();
2633 assert(skind == PyUnicode_1BYTE_KIND);
2634 _PyUnicode_CONVERT_BYTES(
2635 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002636 (const Py_UCS1 *)data,
2637 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002638 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002640 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002641 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002642 if (!result)
2643 return PyErr_NoMemory();
2644 if (skind == PyUnicode_2BYTE_KIND) {
2645 _PyUnicode_CONVERT_BYTES(
2646 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002647 (const Py_UCS2 *)data,
2648 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002649 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002650 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002651 else {
2652 assert(skind == PyUnicode_1BYTE_KIND);
2653 _PyUnicode_CONVERT_BYTES(
2654 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002655 (const Py_UCS1 *)data,
2656 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002657 result);
2658 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002659 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002660 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002661 Py_UNREACHABLE();
2662 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002664}
2665
2666static Py_UCS4*
2667as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2668 int copy_null)
2669{
2670 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002671 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002672 Py_ssize_t len, targetlen;
2673 if (PyUnicode_READY(string) == -1)
2674 return NULL;
2675 kind = PyUnicode_KIND(string);
2676 data = PyUnicode_DATA(string);
2677 len = PyUnicode_GET_LENGTH(string);
2678 targetlen = len;
2679 if (copy_null)
2680 targetlen++;
2681 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002682 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002683 if (!target) {
2684 PyErr_NoMemory();
2685 return NULL;
2686 }
2687 }
2688 else {
2689 if (targetsize < targetlen) {
2690 PyErr_Format(PyExc_SystemError,
2691 "string is longer than the buffer");
2692 if (copy_null && 0 < targetsize)
2693 target[0] = 0;
2694 return NULL;
2695 }
2696 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002697 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002698 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002699 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002701 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002702 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002703 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2704 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002705 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002706 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002707 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002708 else {
2709 Py_UNREACHABLE();
2710 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 if (copy_null)
2712 target[len] = 0;
2713 return target;
2714}
2715
2716Py_UCS4*
2717PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2718 int copy_null)
2719{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002720 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721 PyErr_BadInternalCall();
2722 return NULL;
2723 }
2724 return as_ucs4(string, target, targetsize, copy_null);
2725}
2726
2727Py_UCS4*
2728PyUnicode_AsUCS4Copy(PyObject *string)
2729{
2730 return as_ucs4(string, NULL, 0, 1);
2731}
2732
/* Size of the scratch buffers used to format a %lld or %p conversion.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits, plus 1 for the sign; 53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002737
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002738static int
2739unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2740 Py_ssize_t width, Py_ssize_t precision)
2741{
2742 Py_ssize_t length, fill, arglen;
2743 Py_UCS4 maxchar;
2744
2745 if (PyUnicode_READY(str) == -1)
2746 return -1;
2747
2748 length = PyUnicode_GET_LENGTH(str);
2749 if ((precision == -1 || precision >= length)
2750 && width <= length)
2751 return _PyUnicodeWriter_WriteStr(writer, str);
2752
2753 if (precision != -1)
2754 length = Py_MIN(precision, length);
2755
2756 arglen = Py_MAX(length, width);
2757 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2758 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2759 else
2760 maxchar = writer->maxchar;
2761
2762 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2763 return -1;
2764
2765 if (width > length) {
2766 fill = width - length;
2767 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2768 return -1;
2769 writer->pos += fill;
2770 }
2771
2772 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2773 str, 0, length);
2774 writer->pos += length;
2775 return 0;
2776}
2777
2778static int
Victor Stinner998b8062018-09-12 00:23:25 +02002779unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 Py_ssize_t width, Py_ssize_t precision)
2781{
2782 /* UTF-8 */
2783 Py_ssize_t length;
2784 PyObject *unicode;
2785 int res;
2786
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002787 if (precision == -1) {
2788 length = strlen(str);
2789 }
2790 else {
2791 length = 0;
2792 while (length < precision && str[length]) {
2793 length++;
2794 }
2795 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002796 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2797 if (unicode == NULL)
2798 return -1;
2799
2800 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2801 Py_DECREF(unicode);
2802 return res;
2803}
2804
Victor Stinner96865452011-03-01 23:44:09 +00002805static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002806unicode_fromformat_arg(_PyUnicodeWriter *writer,
2807 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002808{
Victor Stinnere215d962012-10-06 23:03:36 +02002809 const char *p;
2810 Py_ssize_t len;
2811 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002812 Py_ssize_t width;
2813 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002814 int longflag;
2815 int longlongflag;
2816 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002817 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002818
2819 p = f;
2820 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002821 zeropad = 0;
2822 if (*f == '0') {
2823 zeropad = 1;
2824 f++;
2825 }
Victor Stinner96865452011-03-01 23:44:09 +00002826
2827 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002828 width = -1;
2829 if (Py_ISDIGIT((unsigned)*f)) {
2830 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002831 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002832 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002833 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002834 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002835 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002836 return NULL;
2837 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002838 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002839 f++;
2840 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002841 }
2842 precision = -1;
2843 if (*f == '.') {
2844 f++;
2845 if (Py_ISDIGIT((unsigned)*f)) {
2846 precision = (*f - '0');
2847 f++;
2848 while (Py_ISDIGIT((unsigned)*f)) {
2849 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2850 PyErr_SetString(PyExc_ValueError,
2851 "precision too big");
2852 return NULL;
2853 }
2854 precision = (precision * 10) + (*f - '0');
2855 f++;
2856 }
2857 }
Victor Stinner96865452011-03-01 23:44:09 +00002858 if (*f == '%') {
2859 /* "%.3%s" => f points to "3" */
2860 f--;
2861 }
2862 }
2863 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002864 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002865 f--;
2866 }
Victor Stinner96865452011-03-01 23:44:09 +00002867
2868 /* Handle %ld, %lu, %lld and %llu. */
2869 longflag = 0;
2870 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002871 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002872 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002873 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002874 longflag = 1;
2875 ++f;
2876 }
Victor Stinner96865452011-03-01 23:44:09 +00002877 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002878 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002879 longlongflag = 1;
2880 f += 2;
2881 }
Victor Stinner96865452011-03-01 23:44:09 +00002882 }
2883 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002884 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002885 size_tflag = 1;
2886 ++f;
2887 }
Victor Stinnere215d962012-10-06 23:03:36 +02002888
2889 if (f[1] == '\0')
2890 writer->overallocate = 0;
2891
2892 switch (*f) {
2893 case 'c':
2894 {
2895 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002896 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002897 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002898 "character argument not in range(0x110000)");
2899 return NULL;
2900 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002901 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002902 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002903 break;
2904 }
2905
2906 case 'i':
2907 case 'd':
2908 case 'u':
2909 case 'x':
2910 {
2911 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002912 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002913 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002914
2915 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002916 if (longflag) {
2917 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2918 }
2919 else if (longlongflag) {
2920 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2921 }
2922 else if (size_tflag) {
2923 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2924 }
2925 else {
2926 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2927 }
Victor Stinnere215d962012-10-06 23:03:36 +02002928 }
2929 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002930 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002931 }
2932 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002933 if (longflag) {
2934 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2935 }
2936 else if (longlongflag) {
2937 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2938 }
2939 else if (size_tflag) {
2940 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2941 }
2942 else {
2943 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2944 }
Victor Stinnere215d962012-10-06 23:03:36 +02002945 }
2946 assert(len >= 0);
2947
Victor Stinnere215d962012-10-06 23:03:36 +02002948 if (precision < len)
2949 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002950
2951 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002952 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2953 return NULL;
2954
Victor Stinnere215d962012-10-06 23:03:36 +02002955 if (width > precision) {
2956 Py_UCS4 fillchar;
2957 fill = width - precision;
2958 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002959 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2960 return NULL;
2961 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002962 }
Victor Stinner15a11362012-10-06 23:48:20 +02002963 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002964 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002965 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2966 return NULL;
2967 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002968 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002969
Victor Stinner4a587072013-11-19 12:54:53 +01002970 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2971 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002972 break;
2973 }
2974
2975 case 'p':
2976 {
2977 char number[MAX_LONG_LONG_CHARS];
2978
2979 len = sprintf(number, "%p", va_arg(*vargs, void*));
2980 assert(len >= 0);
2981
2982 /* %p is ill-defined: ensure leading 0x. */
2983 if (number[1] == 'X')
2984 number[1] = 'x';
2985 else if (number[1] != 'x') {
2986 memmove(number + 2, number,
2987 strlen(number) + 1);
2988 number[0] = '0';
2989 number[1] = 'x';
2990 len += 2;
2991 }
2992
Victor Stinner4a587072013-11-19 12:54:53 +01002993 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002994 return NULL;
2995 break;
2996 }
2997
2998 case 's':
2999 {
3000 /* UTF-8 */
3001 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02003002 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003003 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003004 break;
3005 }
3006
3007 case 'U':
3008 {
3009 PyObject *obj = va_arg(*vargs, PyObject *);
3010 assert(obj && _PyUnicode_CHECK(obj));
3011
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003012 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003013 return NULL;
3014 break;
3015 }
3016
3017 case 'V':
3018 {
3019 PyObject *obj = va_arg(*vargs, PyObject *);
3020 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02003021 if (obj) {
3022 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003023 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003024 return NULL;
3025 }
3026 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003027 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02003028 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003029 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003030 }
3031 break;
3032 }
3033
3034 case 'S':
3035 {
3036 PyObject *obj = va_arg(*vargs, PyObject *);
3037 PyObject *str;
3038 assert(obj);
3039 str = PyObject_Str(obj);
3040 if (!str)
3041 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003042 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003043 Py_DECREF(str);
3044 return NULL;
3045 }
3046 Py_DECREF(str);
3047 break;
3048 }
3049
3050 case 'R':
3051 {
3052 PyObject *obj = va_arg(*vargs, PyObject *);
3053 PyObject *repr;
3054 assert(obj);
3055 repr = PyObject_Repr(obj);
3056 if (!repr)
3057 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003058 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003059 Py_DECREF(repr);
3060 return NULL;
3061 }
3062 Py_DECREF(repr);
3063 break;
3064 }
3065
3066 case 'A':
3067 {
3068 PyObject *obj = va_arg(*vargs, PyObject *);
3069 PyObject *ascii;
3070 assert(obj);
3071 ascii = PyObject_ASCII(obj);
3072 if (!ascii)
3073 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003074 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003075 Py_DECREF(ascii);
3076 return NULL;
3077 }
3078 Py_DECREF(ascii);
3079 break;
3080 }
3081
3082 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003083 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003084 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003085 break;
3086
3087 default:
3088 /* if we stumble upon an unknown formatting code, copy the rest
3089 of the format string to the output string. (we cannot just
3090 skip the code, since there's no way to know what's in the
3091 argument list) */
3092 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003093 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003094 return NULL;
3095 f = p+len;
3096 return f;
3097 }
3098
3099 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003100 return f;
3101}
3102
Walter Dörwaldd2034312007-05-18 16:29:38 +00003103PyObject *
3104PyUnicode_FromFormatV(const char *format, va_list vargs)
3105{
Victor Stinnere215d962012-10-06 23:03:36 +02003106 va_list vargs2;
3107 const char *f;
3108 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003109
Victor Stinner8f674cc2013-04-17 23:02:17 +02003110 _PyUnicodeWriter_Init(&writer);
3111 writer.min_length = strlen(format) + 100;
3112 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003113
Benjamin Peterson0c212142016-09-20 20:39:33 -07003114 // Copy varags to be able to pass a reference to a subfunction.
3115 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003116
3117 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003118 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003119 f = unicode_fromformat_arg(&writer, f, &vargs2);
3120 if (f == NULL)
3121 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003123 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003124 const char *p;
3125 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003126
Victor Stinnere215d962012-10-06 23:03:36 +02003127 p = f;
3128 do
3129 {
3130 if ((unsigned char)*p > 127) {
3131 PyErr_Format(PyExc_ValueError,
3132 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3133 "string, got a non-ASCII byte: 0x%02x",
3134 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003135 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003136 }
3137 p++;
3138 }
3139 while (*p != '\0' && *p != '%');
3140 len = p - f;
3141
3142 if (*p == '\0')
3143 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003144
3145 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003146 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003147
3148 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003149 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003150 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003151 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003152 return _PyUnicodeWriter_Finish(&writer);
3153
3154 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003155 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003156 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003157 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003158}
3159
Walter Dörwaldd2034312007-05-18 16:29:38 +00003160PyObject *
3161PyUnicode_FromFormat(const char *format, ...)
3162{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003163 PyObject* ret;
3164 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003165
3166#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003167 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003168#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003169 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003170#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003171 ret = PyUnicode_FromFormatV(format, vargs);
3172 va_end(vargs);
3173 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003174}
3175
Serhiy Storchakac46db922018-10-23 22:58:24 +03003176static Py_ssize_t
3177unicode_get_widechar_size(PyObject *unicode)
3178{
3179 Py_ssize_t res;
3180
3181 assert(unicode != NULL);
3182 assert(_PyUnicode_CHECK(unicode));
3183
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003184#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchakac46db922018-10-23 22:58:24 +03003185 if (_PyUnicode_WSTR(unicode) != NULL) {
3186 return PyUnicode_WSTR_LENGTH(unicode);
3187 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003188#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003189 assert(PyUnicode_IS_READY(unicode));
3190
3191 res = _PyUnicode_LENGTH(unicode);
3192#if SIZEOF_WCHAR_T == 2
3193 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3194 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3195 const Py_UCS4 *end = s + res;
3196 for (; s < end; ++s) {
3197 if (*s > 0xFFFF) {
3198 ++res;
3199 }
3200 }
3201 }
3202#endif
3203 return res;
3204}
3205
3206static void
3207unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3208{
Serhiy Storchakac46db922018-10-23 22:58:24 +03003209 assert(unicode != NULL);
3210 assert(_PyUnicode_CHECK(unicode));
3211
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003212#if USE_UNICODE_WCHAR_CACHE
3213 const wchar_t *wstr = _PyUnicode_WSTR(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003214 if (wstr != NULL) {
3215 memcpy(w, wstr, size * sizeof(wchar_t));
3216 return;
3217 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003218#else /* USE_UNICODE_WCHAR_CACHE */
3219 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3220 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3221 return;
3222 }
3223#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003224 assert(PyUnicode_IS_READY(unicode));
3225
3226 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3227 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3228 for (; size--; ++s, ++w) {
3229 *w = *s;
3230 }
3231 }
3232 else {
3233#if SIZEOF_WCHAR_T == 4
3234 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3235 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3236 for (; size--; ++s, ++w) {
3237 *w = *s;
3238 }
3239#else
3240 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3241 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3242 for (; size--; ++s, ++w) {
3243 Py_UCS4 ch = *s;
3244 if (ch > 0xFFFF) {
3245 assert(ch <= MAX_UNICODE);
3246 /* encode surrogate pair in this case */
3247 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3248 if (!size--)
3249 break;
3250 *w = Py_UNICODE_LOW_SURROGATE(ch);
3251 }
3252 else {
3253 *w = ch;
3254 }
3255 }
3256#endif
3257 }
3258}
3259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003260#ifdef HAVE_WCHAR_H
3261
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003262/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003263
Victor Stinnerd88d9832011-09-06 02:00:05 +02003264 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003265 character) required to convert the unicode object. Ignore size argument.
3266
Victor Stinnerd88d9832011-09-06 02:00:05 +02003267 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003268 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003269 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003270Py_ssize_t
3271PyUnicode_AsWideChar(PyObject *unicode,
3272 wchar_t *w,
3273 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003274{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003275 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003276
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003277 if (unicode == NULL) {
3278 PyErr_BadInternalCall();
3279 return -1;
3280 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003281 if (!PyUnicode_Check(unicode)) {
3282 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003283 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003284 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003285
3286 res = unicode_get_widechar_size(unicode);
3287 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003288 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003289 }
3290
3291 if (size > res) {
3292 size = res + 1;
3293 }
3294 else {
3295 res = size;
3296 }
3297 unicode_copy_as_widechar(unicode, w, size);
3298 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003299}
3300
Victor Stinner137c34c2010-09-29 10:25:54 +00003301wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003302PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003303 Py_ssize_t *size)
3304{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003305 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003306 Py_ssize_t buflen;
3307
3308 if (unicode == NULL) {
3309 PyErr_BadInternalCall();
3310 return NULL;
3311 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003312 if (!PyUnicode_Check(unicode)) {
3313 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003314 return NULL;
3315 }
3316
Serhiy Storchakac46db922018-10-23 22:58:24 +03003317 buflen = unicode_get_widechar_size(unicode);
3318 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003319 if (buffer == NULL) {
3320 PyErr_NoMemory();
3321 return NULL;
3322 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003323 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3324 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003325 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003326 }
3327 else if (wcslen(buffer) != (size_t)buflen) {
Victor Stinner00d7abd2020-12-01 09:56:42 +01003328 PyMem_Free(buffer);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003329 PyErr_SetString(PyExc_ValueError,
3330 "embedded null character");
3331 return NULL;
3332 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003333 return buffer;
3334}
3335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003336#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003338int
3339_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3340{
3341 wchar_t **p = (wchar_t **)ptr;
3342 if (obj == NULL) {
3343#if !USE_UNICODE_WCHAR_CACHE
3344 PyMem_Free(*p);
3345#endif /* USE_UNICODE_WCHAR_CACHE */
3346 *p = NULL;
3347 return 1;
3348 }
3349 if (PyUnicode_Check(obj)) {
3350#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003351 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3352 if (*p == NULL) {
3353 return 0;
3354 }
3355 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003356#else /* USE_UNICODE_WCHAR_CACHE */
3357 *p = PyUnicode_AsWideCharString(obj, NULL);
3358 if (*p == NULL) {
3359 return 0;
3360 }
3361 return Py_CLEANUP_SUPPORTED;
3362#endif /* USE_UNICODE_WCHAR_CACHE */
3363 }
3364 PyErr_Format(PyExc_TypeError,
3365 "argument must be str, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003366 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003367 return 0;
3368}
3369
3370int
3371_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3372{
3373 wchar_t **p = (wchar_t **)ptr;
3374 if (obj == NULL) {
3375#if !USE_UNICODE_WCHAR_CACHE
3376 PyMem_Free(*p);
3377#endif /* USE_UNICODE_WCHAR_CACHE */
3378 *p = NULL;
3379 return 1;
3380 }
3381 if (obj == Py_None) {
3382 *p = NULL;
3383 return 1;
3384 }
3385 if (PyUnicode_Check(obj)) {
3386#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003387 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3388 if (*p == NULL) {
3389 return 0;
3390 }
3391 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003392#else /* USE_UNICODE_WCHAR_CACHE */
3393 *p = PyUnicode_AsWideCharString(obj, NULL);
3394 if (*p == NULL) {
3395 return 0;
3396 }
3397 return Py_CLEANUP_SUPPORTED;
3398#endif /* USE_UNICODE_WCHAR_CACHE */
3399 }
3400 PyErr_Format(PyExc_TypeError,
3401 "argument must be str or None, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003402 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003403 return 0;
3404}
3405
Alexander Belopolsky40018472011-02-26 01:02:56 +00003406PyObject *
3407PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003408{
Victor Stinner8faf8212011-12-08 22:14:11 +01003409 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003410 PyErr_SetString(PyExc_ValueError,
3411 "chr() arg not in range(0x110000)");
3412 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003413 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003414
Victor Stinner985a82a2014-01-03 12:53:47 +01003415 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003416}
3417
Alexander Belopolsky40018472011-02-26 01:02:56 +00003418PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003419PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003421 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003422 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003423 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003424 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003425 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003426 Py_INCREF(obj);
3427 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003428 }
3429 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003430 /* For a Unicode subtype that's not a Unicode object,
3431 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003432 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003433 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003434 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003435 "Can't convert '%.100s' object to str implicitly",
3436 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003437 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003438}
3439
Alexander Belopolsky40018472011-02-26 01:02:56 +00003440PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003441PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003442 const char *encoding,
3443 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003444{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003445 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003446 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003447
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003449 PyErr_BadInternalCall();
3450 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003452
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003453 /* Decoding bytes objects is the most common case and should be fast */
3454 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003455 if (PyBytes_GET_SIZE(obj) == 0) {
3456 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3457 return NULL;
3458 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003459 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003460 }
3461 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003462 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3463 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003464 }
3465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003466 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003467 PyErr_SetString(PyExc_TypeError,
3468 "decoding str is not supported");
3469 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003470 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003471
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003472 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3473 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3474 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003475 "decoding to str: need a bytes-like object, %.80s found",
3476 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003477 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003478 }
Tim Petersced69f82003-09-16 20:30:58 +00003479
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003480 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003481 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003482 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3483 return NULL;
3484 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003485 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003487
Serhiy Storchaka05997252013-01-26 12:14:02 +02003488 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003489 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003490 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491}
3492
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased) into `lower`.
   Every run of other characters is replaced by a single '_' when it sits
   between two copied characters; leading and trailing runs are dropped.

   `lower` is a caller-provided buffer of `lower_len` bytes.  On success the
   result is NUL-terminated and 1 is returned.  Return 0 on error (encoding
   is longer than lower_len-1, or lower_len is 0); the buffer content is
   then unspecified and not NUL-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    /* Without this guard lower_len - 1 would wrap around and the end
       pointer computed below would be far outside the buffer. */
    if (lower_len == 0) {
        return 0;
    }

    char *out = lower;
    /* Last usable slot: reserved for the terminating NUL byte. */
    char *const out_end = &lower[lower_len - 1];
    /* Set while a run of separator characters is pending. */
    int pending_sep = 0;

    for (const char *in = encoding; *in != '\0'; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            pending_sep = 1;
            continue;
        }

        /* Emit a single '_' for the pending run, except at the start. */
        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3541
Alexander Belopolsky40018472011-02-26 01:02:56 +00003542PyObject *
3543PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003544 Py_ssize_t size,
3545 const char *encoding,
3546 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003547{
3548 PyObject *buffer = NULL, *unicode;
3549 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003550 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3551
Victor Stinner22eb6892019-06-26 00:51:05 +02003552 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3553 return NULL;
3554 }
3555
Victor Stinnered076ed2019-06-26 01:49:32 +02003556 if (size == 0) {
3557 _Py_RETURN_UNICODE_EMPTY();
3558 }
3559
Victor Stinner942889a2016-09-05 15:40:10 -07003560 if (encoding == NULL) {
3561 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3562 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003563
Fred Drakee4315f52000-05-09 19:53:39 +00003564 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003565 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3566 char *lower = buflower;
3567
3568 /* Fast paths */
3569 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3570 lower += 3;
3571 if (*lower == '_') {
3572 /* Match "utf8" and "utf_8" */
3573 lower++;
3574 }
3575
3576 if (lower[0] == '8' && lower[1] == 0) {
3577 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3578 }
3579 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3580 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3581 }
3582 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3583 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3584 }
3585 }
3586 else {
3587 if (strcmp(lower, "ascii") == 0
3588 || strcmp(lower, "us_ascii") == 0) {
3589 return PyUnicode_DecodeASCII(s, size, errors);
3590 }
Steve Dowercc16be82016-09-08 10:35:16 -07003591 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003592 else if (strcmp(lower, "mbcs") == 0) {
3593 return PyUnicode_DecodeMBCS(s, size, errors);
3594 }
3595 #endif
3596 else if (strcmp(lower, "latin1") == 0
3597 || strcmp(lower, "latin_1") == 0
3598 || strcmp(lower, "iso_8859_1") == 0
3599 || strcmp(lower, "iso8859_1") == 0) {
3600 return PyUnicode_DecodeLatin1(s, size, errors);
3601 }
3602 }
Victor Stinner37296e82010-06-10 13:36:23 +00003603 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604
3605 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003606 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003607 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003608 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003609 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 if (buffer == NULL)
3611 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003612 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613 if (unicode == NULL)
3614 goto onError;
3615 if (!PyUnicode_Check(unicode)) {
3616 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003617 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003618 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003619 encoding,
3620 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621 Py_DECREF(unicode);
3622 goto onError;
3623 }
3624 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003625 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003626
Benjamin Peterson29060642009-01-31 22:14:21 +00003627 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 Py_XDECREF(buffer);
3629 return NULL;
3630}
3631
Alexander Belopolsky40018472011-02-26 01:02:56 +00003632PyObject *
3633PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003634 const char *encoding,
3635 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003636{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003637 if (!PyUnicode_Check(unicode)) {
3638 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003639 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003640 }
3641
Serhiy Storchaka00939072016-10-27 21:05:49 +03003642 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3643 "PyUnicode_AsDecodedObject() is deprecated; "
3644 "use PyCodec_Decode() to decode from str", 1) < 0)
3645 return NULL;
3646
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003647 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003648 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003649
3650 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003651 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003652}
3653
Alexander Belopolsky40018472011-02-26 01:02:56 +00003654PyObject *
3655PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003656 const char *encoding,
3657 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003658{
3659 PyObject *v;
3660
3661 if (!PyUnicode_Check(unicode)) {
3662 PyErr_BadArgument();
3663 goto onError;
3664 }
3665
Serhiy Storchaka00939072016-10-27 21:05:49 +03003666 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3667 "PyUnicode_AsDecodedUnicode() is deprecated; "
3668 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3669 return NULL;
3670
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003671 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003673
3674 /* Decode via the codec registry */
3675 v = PyCodec_Decode(unicode, encoding, errors);
3676 if (v == NULL)
3677 goto onError;
3678 if (!PyUnicode_Check(v)) {
3679 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003680 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003681 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003682 encoding,
3683 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003684 Py_DECREF(v);
3685 goto onError;
3686 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003687 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003688
Benjamin Peterson29060642009-01-31 22:14:21 +00003689 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003690 return NULL;
3691}
3692
Alexander Belopolsky40018472011-02-26 01:02:56 +00003693PyObject *
3694PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003695 Py_ssize_t size,
3696 const char *encoding,
3697 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698{
3699 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003700
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003701 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003703 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3705 Py_DECREF(unicode);
3706 return v;
3707}
3708
Alexander Belopolsky40018472011-02-26 01:02:56 +00003709PyObject *
3710PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003711 const char *encoding,
3712 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003713{
3714 PyObject *v;
3715
3716 if (!PyUnicode_Check(unicode)) {
3717 PyErr_BadArgument();
3718 goto onError;
3719 }
3720
Serhiy Storchaka00939072016-10-27 21:05:49 +03003721 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3722 "PyUnicode_AsEncodedObject() is deprecated; "
3723 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3724 "or PyCodec_Encode() for generic encoding", 1) < 0)
3725 return NULL;
3726
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003727 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003728 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003729
3730 /* Encode via the codec registry */
3731 v = PyCodec_Encode(unicode, encoding, errors);
3732 if (v == NULL)
3733 goto onError;
3734 return v;
3735
Benjamin Peterson29060642009-01-31 22:14:21 +00003736 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003737 return NULL;
3738}
3739
Victor Stinner1b579672011-12-17 05:47:23 +01003740
Victor Stinner2cba6b82018-01-10 22:46:15 +01003741static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003742unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003743 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003744{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003745 Py_ssize_t wlen;
3746 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3747 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003748 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003749 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003750
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003751 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003752 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003753 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003754 return NULL;
3755 }
3756
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003757 char *str;
3758 size_t error_pos;
3759 const char *reason;
3760 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003761 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003762 PyMem_Free(wstr);
3763
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003764 if (res != 0) {
3765 if (res == -2) {
3766 PyObject *exc;
3767 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3768 "locale", unicode,
3769 (Py_ssize_t)error_pos,
3770 (Py_ssize_t)(error_pos+1),
3771 reason);
3772 if (exc != NULL) {
3773 PyCodec_StrictErrors(exc);
3774 Py_DECREF(exc);
3775 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003776 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003777 else if (res == -3) {
3778 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3779 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003780 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003781 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003782 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003783 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003784 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003785
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003786 PyObject *bytes = PyBytes_FromString(str);
3787 PyMem_RawFree(str);
3788 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003789}
3790
Victor Stinnerad158722010-10-27 00:25:46 +00003791PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003792PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3793{
Victor Stinner709d23d2019-05-02 14:56:30 -04003794 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3795 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003796}
3797
3798PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003799PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003800{
Victor Stinner81a7be32020-04-14 15:14:01 +02003801 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003802 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3803 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003804 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003805 fs_codec->error_handler,
3806 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003807 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003808#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003809 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003810 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003811 fs_codec->encoding,
3812 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003813 }
Victor Stinnerad158722010-10-27 00:25:46 +00003814#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003815 else {
3816 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3817 machinery is not ready and so cannot be used:
3818 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003819 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3820 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003821 assert(filesystem_errors != NULL);
3822 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3823 assert(errors != _Py_ERROR_UNKNOWN);
3824#ifdef _Py_FORCE_UTF8_FS_ENCODING
3825 return unicode_encode_utf8(unicode, errors, NULL);
3826#else
3827 return unicode_encode_locale(unicode, errors, 0);
3828#endif
3829 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003830}
3831
Alexander Belopolsky40018472011-02-26 01:02:56 +00003832PyObject *
3833PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003834 const char *encoding,
3835 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836{
3837 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003838 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003839
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840 if (!PyUnicode_Check(unicode)) {
3841 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 }
Fred Drakee4315f52000-05-09 19:53:39 +00003844
Victor Stinner22eb6892019-06-26 00:51:05 +02003845 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3846 return NULL;
3847 }
3848
Victor Stinner942889a2016-09-05 15:40:10 -07003849 if (encoding == NULL) {
3850 return _PyUnicode_AsUTF8String(unicode, errors);
3851 }
3852
Fred Drakee4315f52000-05-09 19:53:39 +00003853 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003854 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3855 char *lower = buflower;
3856
3857 /* Fast paths */
3858 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3859 lower += 3;
3860 if (*lower == '_') {
3861 /* Match "utf8" and "utf_8" */
3862 lower++;
3863 }
3864
3865 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003867 }
3868 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3869 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3870 }
3871 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3872 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3873 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003874 }
Victor Stinner942889a2016-09-05 15:40:10 -07003875 else {
3876 if (strcmp(lower, "ascii") == 0
3877 || strcmp(lower, "us_ascii") == 0) {
3878 return _PyUnicode_AsASCIIString(unicode, errors);
3879 }
Steve Dowercc16be82016-09-08 10:35:16 -07003880#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003881 else if (strcmp(lower, "mbcs") == 0) {
3882 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3883 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003884#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003885 else if (strcmp(lower, "latin1") == 0 ||
3886 strcmp(lower, "latin_1") == 0 ||
3887 strcmp(lower, "iso_8859_1") == 0 ||
3888 strcmp(lower, "iso8859_1") == 0) {
3889 return _PyUnicode_AsLatin1String(unicode, errors);
3890 }
3891 }
Victor Stinner37296e82010-06-10 13:36:23 +00003892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893
3894 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003895 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003897 return NULL;
3898
3899 /* The normal path */
3900 if (PyBytes_Check(v))
3901 return v;
3902
3903 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003904 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003905 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003906 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003907
3908 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003909 "encoder %s returned bytearray instead of bytes; "
3910 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003911 encoding);
3912 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003913 Py_DECREF(v);
3914 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003915 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003916
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003917 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3918 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003919 Py_DECREF(v);
3920 return b;
3921 }
3922
3923 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003924 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003925 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003926 encoding,
3927 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003928 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003929 return NULL;
3930}
3931
Alexander Belopolsky40018472011-02-26 01:02:56 +00003932PyObject *
3933PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003934 const char *encoding,
3935 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003936{
3937 PyObject *v;
3938
3939 if (!PyUnicode_Check(unicode)) {
3940 PyErr_BadArgument();
3941 goto onError;
3942 }
3943
Serhiy Storchaka00939072016-10-27 21:05:49 +03003944 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3945 "PyUnicode_AsEncodedUnicode() is deprecated; "
3946 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3947 return NULL;
3948
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003949 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003951
3952 /* Encode via the codec registry */
3953 v = PyCodec_Encode(unicode, encoding, errors);
3954 if (v == NULL)
3955 goto onError;
3956 if (!PyUnicode_Check(v)) {
3957 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003958 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003959 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003960 encoding,
3961 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003962 Py_DECREF(v);
3963 goto onError;
3964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003966
Benjamin Peterson29060642009-01-31 22:14:21 +00003967 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 return NULL;
3969}
3970
Victor Stinner2cba6b82018-01-10 22:46:15 +01003971static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003972unicode_decode_locale(const char *str, Py_ssize_t len,
3973 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003974{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003975 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3976 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003977 return NULL;
3978 }
3979
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003980 wchar_t *wstr;
3981 size_t wlen;
3982 const char *reason;
3983 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003984 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003985 if (res != 0) {
3986 if (res == -2) {
3987 PyObject *exc;
3988 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3989 "locale", str, len,
3990 (Py_ssize_t)wlen,
3991 (Py_ssize_t)(wlen + 1),
3992 reason);
3993 if (exc != NULL) {
3994 PyCodec_StrictErrors(exc);
3995 Py_DECREF(exc);
3996 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003997 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003998 else if (res == -3) {
3999 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4000 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01004001 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004002 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01004003 }
Victor Stinner2f197072011-12-17 07:08:30 +01004004 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01004005 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004006
4007 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4008 PyMem_RawFree(wstr);
4009 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004010}
4011
4012PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01004013PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4014 const char *errors)
4015{
Victor Stinner709d23d2019-05-02 14:56:30 -04004016 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4017 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01004018}
4019
4020PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01004021PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004022{
4023 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04004024 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4025 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004026}
4027
4028
4029PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00004030PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004031 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00004032 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4033}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004034
Christian Heimes5894ba72007-11-04 11:43:14 +00004035PyObject*
4036PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4037{
Victor Stinner81a7be32020-04-14 15:14:01 +02004038 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02004039 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4040 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04004041 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004042 fs_codec->error_handler,
4043 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04004044 NULL);
4045 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004046#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02004047 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08004048 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004049 fs_codec->encoding,
4050 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004051 }
Victor Stinnerad158722010-10-27 00:25:46 +00004052#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004053 else {
4054 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4055 machinery is not ready and so cannot be used:
4056 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02004057 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4058 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004059 assert(filesystem_errors != NULL);
4060 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4061 assert(errors != _Py_ERROR_UNKNOWN);
4062#ifdef _Py_FORCE_UTF8_FS_ENCODING
4063 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4064#else
4065 return unicode_decode_locale(s, size, errors, 0);
4066#endif
4067 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004068}
4069
Martin v. Löwis011e8422009-05-05 04:43:17 +00004070
4071int
4072PyUnicode_FSConverter(PyObject* arg, void* addr)
4073{
Brett Cannonec6ce872016-09-06 15:50:29 -07004074 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004075 PyObject *output = NULL;
4076 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004077 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004078 if (arg == NULL) {
4079 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08004080 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004081 return 1;
4082 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004083 path = PyOS_FSPath(arg);
4084 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004085 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004086 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004087 if (PyBytes_Check(path)) {
4088 output = path;
4089 }
4090 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4091 output = PyUnicode_EncodeFSDefault(path);
4092 Py_DECREF(path);
4093 if (!output) {
4094 return 0;
4095 }
4096 assert(PyBytes_Check(output));
4097 }
4098
Victor Stinner0ea2a462010-04-30 00:22:08 +00004099 size = PyBytes_GET_SIZE(output);
4100 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02004101 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004102 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00004103 Py_DECREF(output);
4104 return 0;
4105 }
4106 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004107 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004108}
4109
4110
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004111int
4112PyUnicode_FSDecoder(PyObject* arg, void* addr)
4113{
Brett Cannona5711202016-09-06 19:36:01 -07004114 int is_buffer = 0;
4115 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004116 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004117 if (arg == NULL) {
4118 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004119 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004120 return 1;
4121 }
Brett Cannona5711202016-09-06 19:36:01 -07004122
4123 is_buffer = PyObject_CheckBuffer(arg);
4124 if (!is_buffer) {
4125 path = PyOS_FSPath(arg);
4126 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004127 return 0;
4128 }
Brett Cannona5711202016-09-06 19:36:01 -07004129 }
4130 else {
4131 path = arg;
4132 Py_INCREF(arg);
4133 }
4134
4135 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004136 output = path;
4137 }
4138 else if (PyBytes_Check(path) || is_buffer) {
4139 PyObject *path_bytes = NULL;
4140
4141 if (!PyBytes_Check(path) &&
4142 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004143 "path should be string, bytes, or os.PathLike, not %.200s",
4144 Py_TYPE(arg)->tp_name)) {
4145 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004146 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004147 }
4148 path_bytes = PyBytes_FromObject(path);
4149 Py_DECREF(path);
4150 if (!path_bytes) {
4151 return 0;
4152 }
4153 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4154 PyBytes_GET_SIZE(path_bytes));
4155 Py_DECREF(path_bytes);
4156 if (!output) {
4157 return 0;
4158 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004159 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004160 else {
4161 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004162 "path should be string, bytes, or os.PathLike, not %.200s",
4163 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004164 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004165 return 0;
4166 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004167 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004168 Py_DECREF(output);
4169 return 0;
4170 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004171 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004172 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004173 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004174 Py_DECREF(output);
4175 return 0;
4176 }
4177 *(PyObject**)addr = output;
4178 return Py_CLEANUP_SUPPORTED;
4179}
4180
4181
Inada Naoki02a4d572020-02-27 13:48:59 +09004182static int unicode_fill_utf8(PyObject *unicode);
4183
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004184const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004185PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004186{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004187 if (!PyUnicode_Check(unicode)) {
4188 PyErr_BadArgument();
4189 return NULL;
4190 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004191 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004192 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004193
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004194 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004195 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004196 return NULL;
4197 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004198 }
4199
4200 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004201 *psize = PyUnicode_UTF8_LENGTH(unicode);
4202 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004203}
4204
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004205const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004206PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004207{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004208 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4209}
4210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004211Py_UNICODE *
4212PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4213{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004214 if (!PyUnicode_Check(unicode)) {
4215 PyErr_BadArgument();
4216 return NULL;
4217 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004218 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4219 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004220 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004221 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004222 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004223
Serhiy Storchakac46db922018-10-23 22:58:24 +03004224 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4225 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4226 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004227 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004228 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01004229 w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
Serhiy Storchakac46db922018-10-23 22:58:24 +03004230 if (w == NULL) {
4231 PyErr_NoMemory();
4232 return NULL;
4233 }
4234 unicode_copy_as_widechar(unicode, w, wlen + 1);
4235 _PyUnicode_WSTR(unicode) = w;
4236 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4237 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 }
4239 }
4240 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004241 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004242 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004243}
4244
Inada Naoki2c4928d2020-06-17 20:09:44 +09004245/* Deprecated APIs */
4246
4247_Py_COMP_DIAG_PUSH
4248_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4249
Alexander Belopolsky40018472011-02-26 01:02:56 +00004250Py_UNICODE *
4251PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004252{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004253 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254}
4255
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004256const Py_UNICODE *
4257_PyUnicode_AsUnicode(PyObject *unicode)
4258{
4259 Py_ssize_t size;
4260 const Py_UNICODE *wstr;
4261
4262 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4263 if (wstr && wcslen(wstr) != (size_t)size) {
4264 PyErr_SetString(PyExc_ValueError, "embedded null character");
4265 return NULL;
4266 }
4267 return wstr;
4268}
4269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004270
Alexander Belopolsky40018472011-02-26 01:02:56 +00004271Py_ssize_t
4272PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273{
4274 if (!PyUnicode_Check(unicode)) {
4275 PyErr_BadArgument();
4276 goto onError;
4277 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004278 if (_PyUnicode_WSTR(unicode) == NULL) {
4279 if (PyUnicode_AsUnicode(unicode) == NULL)
4280 goto onError;
4281 }
4282 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283
Benjamin Peterson29060642009-01-31 22:14:21 +00004284 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285 return -1;
4286}
4287
Inada Naoki2c4928d2020-06-17 20:09:44 +09004288_Py_COMP_DIAG_POP
4289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004290Py_ssize_t
4291PyUnicode_GetLength(PyObject *unicode)
4292{
Victor Stinner07621332012-06-16 04:53:46 +02004293 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004294 PyErr_BadArgument();
4295 return -1;
4296 }
Victor Stinner07621332012-06-16 04:53:46 +02004297 if (PyUnicode_READY(unicode) == -1)
4298 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004299 return PyUnicode_GET_LENGTH(unicode);
4300}
4301
4302Py_UCS4
4303PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4304{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004305 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004306 int kind;
4307
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004308 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004309 PyErr_BadArgument();
4310 return (Py_UCS4)-1;
4311 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004312 if (PyUnicode_READY(unicode) == -1) {
4313 return (Py_UCS4)-1;
4314 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004315 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004316 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004317 return (Py_UCS4)-1;
4318 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004319 data = PyUnicode_DATA(unicode);
4320 kind = PyUnicode_KIND(unicode);
4321 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004322}
4323
4324int
4325PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4326{
4327 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004328 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004329 return -1;
4330 }
Victor Stinner488fa492011-12-12 00:01:39 +01004331 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004332 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004333 PyErr_SetString(PyExc_IndexError, "string index out of range");
4334 return -1;
4335 }
Victor Stinner488fa492011-12-12 00:01:39 +01004336 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004337 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004338 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4339 PyErr_SetString(PyExc_ValueError, "character out of range");
4340 return -1;
4341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004342 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4343 index, ch);
4344 return 0;
4345}
4346
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4352
Victor Stinner554f3f02010-06-16 23:33:54 +00004353/* create or adjust a UnicodeDecodeError */
4354static void
4355make_decode_exception(PyObject **exceptionObject,
4356 const char *encoding,
4357 const char *input, Py_ssize_t length,
4358 Py_ssize_t startpos, Py_ssize_t endpos,
4359 const char *reason)
4360{
4361 if (*exceptionObject == NULL) {
4362 *exceptionObject = PyUnicodeDecodeError_Create(
4363 encoding, input, length, startpos, endpos, reason);
4364 }
4365 else {
4366 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4367 goto onError;
4368 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4369 goto onError;
4370 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4371 goto onError;
4372 }
4373 return;
4374
4375onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004376 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004377}
4378
Steve Dowercc16be82016-09-08 10:35:16 -07004379#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004380static int
4381widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4382{
4383 if (newsize > *size) {
4384 wchar_t *newbuf = *buf;
4385 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4386 PyErr_NoMemory();
4387 return -1;
4388 }
4389 *buf = newbuf;
4390 }
4391 *size = newsize;
4392 return 0;
4393}
4394
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395/* error handling callback helper:
4396 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004397 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 and adjust various state variables.
4399 return 0 on success, -1 on error
4400*/
4401
Alexander Belopolsky40018472011-02-26 01:02:56 +00004402static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004403unicode_decode_call_errorhandler_wchar(
4404 const char *errors, PyObject **errorHandler,
4405 const char *encoding, const char *reason,
4406 const char **input, const char **inend, Py_ssize_t *startinpos,
4407 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004408 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004410 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411
4412 PyObject *restuple = NULL;
4413 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004414 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004415 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004416 Py_ssize_t requiredsize;
4417 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004418 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004419 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420
4421 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 *errorHandler = PyCodec_LookupError(errors);
4423 if (*errorHandler == NULL)
4424 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 }
4426
Victor Stinner554f3f02010-06-16 23:33:54 +00004427 make_decode_exception(exceptionObject,
4428 encoding,
4429 *input, *inend - *input,
4430 *startinpos, *endinpos,
4431 reason);
4432 if (*exceptionObject == NULL)
4433 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434
Petr Viktorinffd97532020-02-11 17:46:57 +01004435 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004439 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004440 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004442 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004443 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004444
4445 /* Copy back the bytes variables, which might have been modified by the
4446 callback */
4447 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4448 if (!inputobj)
4449 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004450 *input = PyBytes_AS_STRING(inputobj);
4451 insize = PyBytes_GET_SIZE(inputobj);
4452 *inend = *input + insize;
4453 /* we can DECREF safely, as the exception has another reference,
4454 so the object won't go away. */
4455 Py_DECREF(inputobj);
4456
4457 if (newpos<0)
4458 newpos = insize+newpos;
4459 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004460 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004461 goto onError;
4462 }
4463
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004464#if USE_UNICODE_WCHAR_CACHE
4465_Py_COMP_DIAG_PUSH
4466_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4467 repwlen = PyUnicode_GetSize(repunicode);
4468 if (repwlen < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004469 goto onError;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004470_Py_COMP_DIAG_POP
4471#else /* USE_UNICODE_WCHAR_CACHE */
4472 repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4473 if (repwlen < 0)
4474 goto onError;
4475 repwlen--;
4476#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004477 /* need more space? (at least enough for what we
4478 have+the replacement+the rest of the string (starting
4479 at the new input position), so we won't have to check space
4480 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004481 requiredsize = *outpos;
4482 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4483 goto overflow;
4484 requiredsize += repwlen;
4485 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4486 goto overflow;
4487 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004488 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004489 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004490 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004491 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004492 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004493 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004494 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004495 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004496 PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004497 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004498 *endinpos = newpos;
4499 *inptr = *input + newpos;
4500
4501 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004502 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004503 return 0;
4504
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004505 overflow:
4506 PyErr_SetString(PyExc_OverflowError,
4507 "decoded result is too long for a Python string");
4508
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004509 onError:
4510 Py_XDECREF(restuple);
4511 return -1;
4512}
Steve Dowercc16be82016-09-08 10:35:16 -07004513#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004514
4515static int
4516unicode_decode_call_errorhandler_writer(
4517 const char *errors, PyObject **errorHandler,
4518 const char *encoding, const char *reason,
4519 const char **input, const char **inend, Py_ssize_t *startinpos,
4520 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4521 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4522{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004523 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004524
4525 PyObject *restuple = NULL;
4526 PyObject *repunicode = NULL;
4527 Py_ssize_t insize;
4528 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004529 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004530 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004531 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004532 int need_to_grow = 0;
4533 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004534
4535 if (*errorHandler == NULL) {
4536 *errorHandler = PyCodec_LookupError(errors);
4537 if (*errorHandler == NULL)
4538 goto onError;
4539 }
4540
4541 make_decode_exception(exceptionObject,
4542 encoding,
4543 *input, *inend - *input,
4544 *startinpos, *endinpos,
4545 reason);
4546 if (*exceptionObject == NULL)
4547 goto onError;
4548
Petr Viktorinffd97532020-02-11 17:46:57 +01004549 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004550 if (restuple == NULL)
4551 goto onError;
4552 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004553 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004554 goto onError;
4555 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004556 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004557 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004558
4559 /* Copy back the bytes variables, which might have been modified by the
4560 callback */
4561 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4562 if (!inputobj)
4563 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004564 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004565 *input = PyBytes_AS_STRING(inputobj);
4566 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004567 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004568 /* we can DECREF safely, as the exception has another reference,
4569 so the object won't go away. */
4570 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004573 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004574 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004575 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004577 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578
Victor Stinner170ca6f2013-04-18 00:25:28 +02004579 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004580 if (replen > 1) {
4581 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004582 need_to_grow = 1;
4583 }
4584 new_inptr = *input + newpos;
4585 if (*inend - new_inptr > remain) {
4586 /* We don't know the decoding algorithm here so we make the worst
4587 assumption that one byte decodes to one unicode character.
4588 If unfortunately one byte could decode to more unicode characters,
4589 the decoder may write out-of-bound then. Is it possible for the
4590 algorithms using this function? */
4591 writer->min_length += *inend - new_inptr - remain;
4592 need_to_grow = 1;
4593 }
4594 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004595 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004596 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004597 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4598 goto onError;
4599 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004600 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004601 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004602
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004603 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004604 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004605
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004607 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004608 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004611 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004612 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613}
4614
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004615/* --- UTF-7 Codec -------------------------------------------------------- */
4616
Antoine Pitrou244651a2009-05-04 18:56:13 +00004617/* See RFC2152 for details. We encode conservatively and decode liberally. */
4618
/* Three simple macros defining base-64.
 * Each one may evaluate its argument more than once, so pass only
 * side-effect-free expressions. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 :                              \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :  \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :  \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4649
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c may be evaluated several times; directO and directWS are
 * parenthesized so that compound flag expressions bind as intended. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 &&                                \
     ((utf7_category[(c)] == 0) ||                          \
      ((directWS) && (utf7_category[(c)] == 2)) ||          \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004695
Alexander Belopolsky40018472011-02-26 01:02:56 +00004696PyObject *
4697PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004698 Py_ssize_t size,
4699 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004700{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004701 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4702}
4703
Antoine Pitrou244651a2009-05-04 18:56:13 +00004704/* The decoder. The only state we preserve is our read position,
4705 * i.e. how many characters we have consumed. So if we end in the
4706 * middle of a shift sequence we have to back off the read position
4707 * and the output to the beginning of the sequence, otherwise we lose
4708 * all the shift state (seen bits, number of bits seen, high
4709 * surrogate). */
4710
Alexander Belopolsky40018472011-02-26 01:02:56 +00004711PyObject *
4712PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004713 Py_ssize_t size,
4714 const char *errors,
4715 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004716{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004718 Py_ssize_t startinpos;
4719 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004720 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004721 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004722 const char *errmsg = "";
4723 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004724 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004725 unsigned int base64bits = 0;
4726 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004727 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 PyObject *errorHandler = NULL;
4729 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004730
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004731 if (size == 0) {
4732 if (consumed)
4733 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004734 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004735 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004736
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004737 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004738 _PyUnicodeWriter_Init(&writer);
4739 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004740
4741 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004742 e = s + size;
4743
4744 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004745 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004746 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004747 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004748
Antoine Pitrou244651a2009-05-04 18:56:13 +00004749 if (inShift) { /* in a base-64 section */
4750 if (IS_BASE64(ch)) { /* consume a base-64 character */
4751 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4752 base64bits += 6;
4753 s++;
4754 if (base64bits >= 16) {
4755 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004756 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004757 base64bits -= 16;
4758 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004759 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004760 if (surrogate) {
4761 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004762 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4763 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004764 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004765 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004766 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004767 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004768 }
4769 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004770 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004771 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004772 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004773 }
4774 }
Victor Stinner551ac952011-11-29 22:58:13 +01004775 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004776 /* first surrogate */
4777 surrogate = outCh;
4778 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004779 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004780 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004781 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004782 }
4783 }
4784 }
4785 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004786 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004787 if (base64bits > 0) { /* left-over bits */
4788 if (base64bits >= 6) {
4789 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004790 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 errmsg = "partial character in shift sequence";
4792 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004793 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004794 else {
4795 /* Some bits remain; they should be zero */
4796 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004797 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004798 errmsg = "non-zero padding bits in shift sequence";
4799 goto utf7Error;
4800 }
4801 }
4802 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004803 if (surrogate && DECODE_DIRECT(ch)) {
4804 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4805 goto onError;
4806 }
4807 surrogate = 0;
4808 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004809 /* '-' is absorbed; other terminating
4810 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004811 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004812 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004813 }
4814 }
4815 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004817 s++; /* consume '+' */
4818 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004819 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004820 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004821 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004822 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004823 else if (s < e && !IS_BASE64(*s)) {
4824 s++;
4825 errmsg = "ill-formed sequence";
4826 goto utf7Error;
4827 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004828 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004829 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004830 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004831 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004832 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004833 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004834 }
4835 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004836 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004837 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004838 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004839 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004840 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004841 else {
4842 startinpos = s-starts;
4843 s++;
4844 errmsg = "unexpected special character";
4845 goto utf7Error;
4846 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004847 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004848utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004850 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004851 errors, &errorHandler,
4852 "utf7", errmsg,
4853 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004854 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004855 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004856 }
4857
Antoine Pitrou244651a2009-05-04 18:56:13 +00004858 /* end of string */
4859
4860 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4861 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004862 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004863 if (surrogate ||
4864 (base64bits >= 6) ||
4865 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004866 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004867 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004868 errors, &errorHandler,
4869 "utf7", "unterminated shift sequence",
4870 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004871 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004872 goto onError;
4873 if (s < e)
4874 goto restart;
4875 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004876 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004877
4878 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004879 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004880 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004881 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004882 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004883 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004884 writer.kind, writer.data, shiftOutStart);
4885 Py_XDECREF(errorHandler);
4886 Py_XDECREF(exc);
4887 _PyUnicodeWriter_Dealloc(&writer);
4888 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004889 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004890 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004891 }
4892 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004893 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004894 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004895 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004896
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 Py_XDECREF(errorHandler);
4898 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004899 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004900
Benjamin Peterson29060642009-01-31 22:14:21 +00004901 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004902 Py_XDECREF(errorHandler);
4903 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004904 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004905 return NULL;
4906}
4907
4908
Alexander Belopolsky40018472011-02-26 01:02:56 +00004909PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004910_PyUnicode_EncodeUTF7(PyObject *str,
4911 int base64SetO,
4912 int base64WhiteSpace,
4913 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004914{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004915 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004916 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004917 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004918 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004919 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004920 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004921 unsigned int base64bits = 0;
4922 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004923 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004924 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004925
Benjamin Petersonbac79492012-01-14 13:34:47 -05004926 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004927 return NULL;
4928 kind = PyUnicode_KIND(str);
4929 data = PyUnicode_DATA(str);
4930 len = PyUnicode_GET_LENGTH(str);
4931
4932 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004933 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004934
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004935 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004936 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004937 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004938 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004939 if (v == NULL)
4940 return NULL;
4941
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004942 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004943 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004944 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004945
Antoine Pitrou244651a2009-05-04 18:56:13 +00004946 if (inShift) {
4947 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4948 /* shifting out */
4949 if (base64bits) { /* output remaining bits */
4950 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4951 base64buffer = 0;
4952 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004953 }
4954 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004955 /* Characters not in the BASE64 set implicitly unshift the sequence
4956 so no '-' is required, except if the character is itself a '-' */
4957 if (IS_BASE64(ch) || ch == '-') {
4958 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004959 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004960 *out++ = (char) ch;
4961 }
4962 else {
4963 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004964 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004965 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004966 else { /* not in a shift sequence */
4967 if (ch == '+') {
4968 *out++ = '+';
4969 *out++ = '-';
4970 }
4971 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4972 *out++ = (char) ch;
4973 }
4974 else {
4975 *out++ = '+';
4976 inShift = 1;
4977 goto encode_char;
4978 }
4979 }
4980 continue;
4981encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004982 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004983 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004984
Antoine Pitrou244651a2009-05-04 18:56:13 +00004985 /* code first surrogate */
4986 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004987 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004988 while (base64bits >= 6) {
4989 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4990 base64bits -= 6;
4991 }
4992 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004993 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004994 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004995 base64bits += 16;
4996 base64buffer = (base64buffer << 16) | ch;
4997 while (base64bits >= 6) {
4998 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4999 base64bits -= 6;
5000 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00005001 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005002 if (base64bits)
5003 *out++= TO_BASE64(base64buffer << (6-base64bits) );
5004 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005005 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005006 if (_PyBytes_Resize(&v, out - start) < 0)
5007 return NULL;
5008 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005009}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005010PyObject *
5011PyUnicode_EncodeUTF7(const Py_UNICODE *s,
5012 Py_ssize_t size,
5013 int base64SetO,
5014 int base64WhiteSpace,
5015 const char *errors)
5016{
5017 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005018 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005019 if (tmp == NULL)
5020 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01005021 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005022 base64WhiteSpace, errors);
5023 Py_DECREF(tmp);
5024 return result;
5025}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005026
Antoine Pitrou244651a2009-05-04 18:56:13 +00005027#undef IS_BASE64
5028#undef FROM_BASE64
5029#undef TO_BASE64
5030#undef DECODE_DIRECT
5031#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005032
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033/* --- UTF-8 Codec -------------------------------------------------------- */
5034
Alexander Belopolsky40018472011-02-26 01:02:56 +00005035PyObject *
5036PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005037 Py_ssize_t size,
5038 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039{
Walter Dörwald69652032004-09-07 20:24:22 +00005040 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5041}
5042
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005043#include "stringlib/asciilib.h"
5044#include "stringlib/codecs.h"
5045#include "stringlib/undef.h"
5046
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005047#include "stringlib/ucs1lib.h"
5048#include "stringlib/codecs.h"
5049#include "stringlib/undef.h"
5050
5051#include "stringlib/ucs2lib.h"
5052#include "stringlib/codecs.h"
5053#include "stringlib/undef.h"
5054
5055#include "stringlib/ucs4lib.h"
5056#include "stringlib/codecs.h"
5057#include "stringlib/undef.h"
5058
Ma Lina0c603c2020-10-18 22:48:38 +08005059/* Mask to quickly check whether a C 'size_t' contains a
Antoine Pitrouab868312009-01-10 15:40:25 +00005060 non-ASCII, UTF8-encoded char. */
Ma Lina0c603c2020-10-18 22:48:38 +08005061#if (SIZEOF_SIZE_T == 8)
5062# define ASCII_CHAR_MASK 0x8080808080808080ULL
5063#elif (SIZEOF_SIZE_T == 4)
5064# define ASCII_CHAR_MASK 0x80808080U
Antoine Pitrouab868312009-01-10 15:40:25 +00005065#else
Ma Lina0c603c2020-10-18 22:48:38 +08005066# error C 'size_t' size should be either 4 or 8!
Antoine Pitrouab868312009-01-10 15:40:25 +00005067#endif
5068
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005069static Py_ssize_t
5070ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005071{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072 const char *p = start;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005073
Ma Lina0c603c2020-10-18 22:48:38 +08005074#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
Jessica Clarkedec07572021-03-31 11:12:39 +01005075 assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
5076 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005077 /* Fast path, see in STRINGLIB(utf8_decode) for
5078 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005079 /* Help allocation */
5080 const char *_p = p;
5081 Py_UCS1 * q = dest;
Jessica Clarkedec07572021-03-31 11:12:39 +01005082 while (_p + SIZEOF_SIZE_T <= end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005083 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005084 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00005085 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005086 *((size_t *)q) = value;
5087 _p += SIZEOF_SIZE_T;
5088 q += SIZEOF_SIZE_T;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005089 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005090 p = _p;
5091 while (p < end) {
5092 if ((unsigned char)*p & 0x80)
5093 break;
5094 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005096 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005098#endif
5099 while (p < end) {
5100 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5101 for an explanation. */
Jessica Clarkedec07572021-03-31 11:12:39 +01005102 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005103 /* Help allocation */
5104 const char *_p = p;
Jessica Clarkedec07572021-03-31 11:12:39 +01005105 while (_p + SIZEOF_SIZE_T <= end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005106 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005107 if (value & ASCII_CHAR_MASK)
5108 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005109 _p += SIZEOF_SIZE_T;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005110 }
5111 p = _p;
5112 if (_p == end)
5113 break;
5114 }
5115 if ((unsigned char)*p & 0x80)
5116 break;
5117 ++p;
5118 }
5119 memcpy(dest, start, p - start);
5120 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121}
Antoine Pitrouab868312009-01-10 15:40:25 +00005122
Victor Stinner709d23d2019-05-02 14:56:30 -04005123static PyObject *
5124unicode_decode_utf8(const char *s, Py_ssize_t size,
5125 _Py_error_handler error_handler, const char *errors,
5126 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005127{
Victor Stinner785938e2011-12-11 20:09:03 +01005128 if (size == 0) {
5129 if (consumed)
5130 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005131 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005132 }
5133
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005134 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5135 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005136 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005137 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005138 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005139 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005140 }
5141
Inada Naoki770847a2019-06-24 12:30:24 +09005142 const char *starts = s;
5143 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005144
Inada Naoki770847a2019-06-24 12:30:24 +09005145 // fast path: try ASCII string.
5146 PyObject *u = PyUnicode_New(size, 127);
5147 if (u == NULL) {
5148 return NULL;
5149 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005150 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005151 if (s == end) {
5152 return u;
5153 }
5154
5155 // Use _PyUnicodeWriter after fast path is failed.
5156 _PyUnicodeWriter writer;
5157 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5158 writer.pos = s - starts;
5159
5160 Py_ssize_t startinpos, endinpos;
5161 const char *errmsg = "";
5162 PyObject *error_handler_obj = NULL;
5163 PyObject *exc = NULL;
5164
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005165 while (s < end) {
5166 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005167 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005168
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005169 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005170 if (PyUnicode_IS_ASCII(writer.buffer))
5171 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005172 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005173 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005174 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005175 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005176 } else {
5177 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005178 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005179 }
5180
5181 switch (ch) {
5182 case 0:
5183 if (s == end || consumed)
5184 goto End;
5185 errmsg = "unexpected end of data";
5186 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005187 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005188 break;
5189 case 1:
5190 errmsg = "invalid start byte";
5191 startinpos = s - starts;
5192 endinpos = startinpos + 1;
5193 break;
5194 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005195 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5196 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5197 {
5198 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005199 goto End;
5200 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005201 /* fall through */
5202 case 3:
5203 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005204 errmsg = "invalid continuation byte";
5205 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005206 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005207 break;
5208 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005209 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005210 goto onError;
5211 continue;
5212 }
5213
Victor Stinner1d65d912015-10-05 13:43:50 +02005214 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005215 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005216
5217 switch (error_handler) {
5218 case _Py_ERROR_IGNORE:
5219 s += (endinpos - startinpos);
5220 break;
5221
5222 case _Py_ERROR_REPLACE:
5223 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5224 goto onError;
5225 s += (endinpos - startinpos);
5226 break;
5227
5228 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005229 {
5230 Py_ssize_t i;
5231
Victor Stinner1d65d912015-10-05 13:43:50 +02005232 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5233 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005234 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005235 ch = (Py_UCS4)(unsigned char)(starts[i]);
5236 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5237 ch + 0xdc00);
5238 writer.pos++;
5239 }
5240 s += (endinpos - startinpos);
5241 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005242 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005243
5244 default:
5245 if (unicode_decode_call_errorhandler_writer(
5246 errors, &error_handler_obj,
5247 "utf-8", errmsg,
5248 &starts, &end, &startinpos, &endinpos, &exc, &s,
5249 &writer))
5250 goto onError;
5251 }
Victor Stinner785938e2011-12-11 20:09:03 +01005252 }
5253
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005254End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005255 if (consumed)
5256 *consumed = s - starts;
5257
Victor Stinner1d65d912015-10-05 13:43:50 +02005258 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005259 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005260 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005261
5262onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005263 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005264 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005265 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005266 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005267}
5268
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005269
Victor Stinner709d23d2019-05-02 14:56:30 -04005270PyObject *
5271PyUnicode_DecodeUTF8Stateful(const char *s,
5272 Py_ssize_t size,
5273 const char *errors,
5274 Py_ssize_t *consumed)
5275{
5276 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5277}
5278
5279
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005280/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5281 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005282
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005283 On success, write a pointer to a newly allocated wide character string into
5284 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5285 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005286
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005287 On memory allocation failure, return -1.
5288
5289 On decoding error (if surrogateescape is zero), return -2. If wlen is
5290 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5291 is not NULL, write the decoding error message into *reason. */
5292int
5293_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005294 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005295{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005296 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005297 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005298 wchar_t *unicode;
5299 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005300
Victor Stinner3d4226a2018-08-29 22:21:32 +02005301 int surrogateescape = 0;
5302 int surrogatepass = 0;
5303 switch (errors)
5304 {
5305 case _Py_ERROR_STRICT:
5306 break;
5307 case _Py_ERROR_SURROGATEESCAPE:
5308 surrogateescape = 1;
5309 break;
5310 case _Py_ERROR_SURROGATEPASS:
5311 surrogatepass = 1;
5312 break;
5313 default:
5314 return -3;
5315 }
5316
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005317 /* Note: size will always be longer than the resulting Unicode
5318 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005319 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005320 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005321 }
5322
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005323 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005324 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005325 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005326 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005327
5328 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005329 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005330 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005331 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005332 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005333#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005334 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005335#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005336 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005337#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005338 if (ch > 0xFF) {
5339#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005340 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005341#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005342 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005343 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005344 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5345 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5346#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005347 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005348 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005349 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005350 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005351 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005352
5353 if (surrogateescape) {
5354 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5355 }
5356 else {
5357 /* Is it a valid three-byte code? */
5358 if (surrogatepass
5359 && (e - s) >= 3
5360 && (s[0] & 0xf0) == 0xe0
5361 && (s[1] & 0xc0) == 0x80
5362 && (s[2] & 0xc0) == 0x80)
5363 {
5364 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5365 s += 3;
5366 unicode[outpos++] = ch;
5367 }
5368 else {
5369 PyMem_RawFree(unicode );
5370 if (reason != NULL) {
5371 switch (ch) {
5372 case 0:
5373 *reason = "unexpected end of data";
5374 break;
5375 case 1:
5376 *reason = "invalid start byte";
5377 break;
5378 /* 2, 3, 4 */
5379 default:
5380 *reason = "invalid continuation byte";
5381 break;
5382 }
5383 }
5384 if (wlen != NULL) {
5385 *wlen = s - orig_s;
5386 }
5387 return -2;
5388 }
5389 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005390 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005391 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005392 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005393 if (wlen) {
5394 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005395 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005396 *wstr = unicode;
5397 return 0;
5398}
5399
Victor Stinner5f9cf232019-03-19 01:46:25 +01005400
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005401wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005402_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5403 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005404{
5405 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005406 int res = _Py_DecodeUTF8Ex(arg, arglen,
5407 &wstr, wlen,
5408 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005409 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005410 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5411 assert(res != -3);
5412 if (wlen) {
5413 *wlen = (size_t)res;
5414 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005415 return NULL;
5416 }
5417 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005418}
5419
Antoine Pitrouab868312009-01-10 15:40:25 +00005420
Victor Stinnere47e6982017-12-21 15:45:16 +01005421/* UTF-8 encoder using the surrogateescape error handler .
5422
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005423 On success, return 0 and write the newly allocated character string (use
5424 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005425
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005426 On encoding failure, return -2 and write the position of the invalid
5427 surrogate character into *error_pos (if error_pos is set) and the decoding
5428 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005429
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005430 On memory allocation failure, return -1. */
5431int
5432_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005433 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005434{
5435 const Py_ssize_t max_char_size = 4;
5436 Py_ssize_t len = wcslen(text);
5437
5438 assert(len >= 0);
5439
Victor Stinner3d4226a2018-08-29 22:21:32 +02005440 int surrogateescape = 0;
5441 int surrogatepass = 0;
5442 switch (errors)
5443 {
5444 case _Py_ERROR_STRICT:
5445 break;
5446 case _Py_ERROR_SURROGATEESCAPE:
5447 surrogateescape = 1;
5448 break;
5449 case _Py_ERROR_SURROGATEPASS:
5450 surrogatepass = 1;
5451 break;
5452 default:
5453 return -3;
5454 }
5455
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005456 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5457 return -1;
5458 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005459 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005460 if (raw_malloc) {
5461 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005462 }
5463 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005464 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005465 }
5466 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005467 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005468 }
5469
5470 char *p = bytes;
5471 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005472 for (i = 0; i < len; ) {
5473 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005474 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005475 i++;
5476#if Py_UNICODE_SIZE == 2
5477 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5478 && i < len
5479 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5480 {
5481 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5482 i++;
5483 }
5484#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005485
5486 if (ch < 0x80) {
5487 /* Encode ASCII */
5488 *p++ = (char) ch;
5489
5490 }
5491 else if (ch < 0x0800) {
5492 /* Encode Latin-1 */
5493 *p++ = (char)(0xc0 | (ch >> 6));
5494 *p++ = (char)(0x80 | (ch & 0x3f));
5495 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005496 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005497 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005498 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005499 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005500 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005501 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005502 if (reason != NULL) {
5503 *reason = "encoding error";
5504 }
5505 if (raw_malloc) {
5506 PyMem_RawFree(bytes);
5507 }
5508 else {
5509 PyMem_Free(bytes);
5510 }
5511 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005512 }
5513 *p++ = (char)(ch & 0xff);
5514 }
5515 else if (ch < 0x10000) {
5516 *p++ = (char)(0xe0 | (ch >> 12));
5517 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5518 *p++ = (char)(0x80 | (ch & 0x3f));
5519 }
5520 else { /* ch >= 0x10000 */
5521 assert(ch <= MAX_UNICODE);
5522 /* Encode UCS4 Unicode ordinals */
5523 *p++ = (char)(0xf0 | (ch >> 18));
5524 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5525 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5526 *p++ = (char)(0x80 | (ch & 0x3f));
5527 }
5528 }
5529 *p++ = '\0';
5530
5531 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005532 char *bytes2;
5533 if (raw_malloc) {
5534 bytes2 = PyMem_RawRealloc(bytes, final_size);
5535 }
5536 else {
5537 bytes2 = PyMem_Realloc(bytes, final_size);
5538 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005539 if (bytes2 == NULL) {
5540 if (error_pos != NULL) {
5541 *error_pos = (size_t)-1;
5542 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005543 if (raw_malloc) {
5544 PyMem_RawFree(bytes);
5545 }
5546 else {
5547 PyMem_Free(bytes);
5548 }
5549 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005550 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005551 *str = bytes2;
5552 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005553}
5554
5555
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005556/* Primary internal function which creates utf8 encoded bytes objects.
5557
5558 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005559 and allocate exactly as much space needed at the end. Else allocate the
5560 maximum possible needed (4 result bytes per Unicode character), and return
5561 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005562*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005563static PyObject *
5564unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5565 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005567 if (!PyUnicode_Check(unicode)) {
5568 PyErr_BadArgument();
5569 return NULL;
5570 }
5571
5572 if (PyUnicode_READY(unicode) == -1)
5573 return NULL;
5574
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005575 if (PyUnicode_UTF8(unicode))
5576 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5577 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005578
Inada Naoki02a4d572020-02-27 13:48:59 +09005579 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005580 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005581 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5582
5583 _PyBytesWriter writer;
5584 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005585
Benjamin Petersonead6b532011-12-20 17:23:42 -06005586 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005587 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005588 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005589 case PyUnicode_1BYTE_KIND:
5590 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5591 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005592 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5593 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005594 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005595 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5596 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005597 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005598 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5599 break;
Tim Peters602f7402002-04-27 18:03:26 +00005600 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005601
5602 if (end == NULL) {
5603 _PyBytesWriter_Dealloc(&writer);
5604 return NULL;
5605 }
5606 return _PyBytesWriter_Finish(&writer, end);
5607}
5608
5609static int
5610unicode_fill_utf8(PyObject *unicode)
5611{
5612 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5613 assert(!PyUnicode_IS_ASCII(unicode));
5614
5615 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005616 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005617 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5618
5619 _PyBytesWriter writer;
5620 char *end;
5621
5622 switch (kind) {
5623 default:
5624 Py_UNREACHABLE();
5625 case PyUnicode_1BYTE_KIND:
5626 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5627 _Py_ERROR_STRICT, NULL);
5628 break;
5629 case PyUnicode_2BYTE_KIND:
5630 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5631 _Py_ERROR_STRICT, NULL);
5632 break;
5633 case PyUnicode_4BYTE_KIND:
5634 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5635 _Py_ERROR_STRICT, NULL);
5636 break;
5637 }
5638 if (end == NULL) {
5639 _PyBytesWriter_Dealloc(&writer);
5640 return -1;
5641 }
5642
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005643 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005644 PyBytes_AS_STRING(writer.buffer);
5645 Py_ssize_t len = end - start;
5646
Victor Stinner32bd68c2020-12-01 10:37:39 +01005647 char *cache = PyObject_Malloc(len + 1);
Inada Naoki02a4d572020-02-27 13:48:59 +09005648 if (cache == NULL) {
5649 _PyBytesWriter_Dealloc(&writer);
5650 PyErr_NoMemory();
5651 return -1;
5652 }
5653 _PyUnicode_UTF8(unicode) = cache;
5654 _PyUnicode_UTF8_LENGTH(unicode) = len;
5655 memcpy(cache, start, len);
5656 cache[len] = '\0';
5657 _PyBytesWriter_Dealloc(&writer);
5658 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659}
5660
Alexander Belopolsky40018472011-02-26 01:02:56 +00005661PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005662_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5663{
5664 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5665}
5666
5667
5668PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005669PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5670 Py_ssize_t size,
5671 const char *errors)
5672{
5673 PyObject *v, *unicode;
5674
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005675 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005676 if (unicode == NULL)
5677 return NULL;
5678 v = _PyUnicode_AsUTF8String(unicode, errors);
5679 Py_DECREF(unicode);
5680 return v;
5681}
5682
5683PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005684PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005685{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005686 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687}
5688
Walter Dörwald41980ca2007-08-16 21:55:45 +00005689/* --- UTF-32 Codec ------------------------------------------------------- */
5690
5691PyObject *
5692PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005693 Py_ssize_t size,
5694 const char *errors,
5695 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005696{
5697 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5698}
5699
5700PyObject *
5701PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 Py_ssize_t size,
5703 const char *errors,
5704 int *byteorder,
5705 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005706{
5707 const char *starts = s;
5708 Py_ssize_t startinpos;
5709 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005710 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005711 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005712 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005713 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005714 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005715 PyObject *errorHandler = NULL;
5716 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005717
Andy Lestere6be9b52020-02-11 20:28:35 -06005718 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005719 e = q + size;
5720
5721 if (byteorder)
5722 bo = *byteorder;
5723
5724 /* Check for BOM marks (U+FEFF) in the input and adjust current
5725 byte order setting accordingly. In native mode, the leading BOM
5726 mark is skipped, in all other modes, it is copied to the output
5727 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005728 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005729 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005730 if (bom == 0x0000FEFF) {
5731 bo = -1;
5732 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005734 else if (bom == 0xFFFE0000) {
5735 bo = 1;
5736 q += 4;
5737 }
5738 if (byteorder)
5739 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005740 }
5741
Victor Stinnere64322e2012-10-30 23:12:47 +01005742 if (q == e) {
5743 if (consumed)
5744 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005745 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005746 }
5747
Victor Stinnere64322e2012-10-30 23:12:47 +01005748#ifdef WORDS_BIGENDIAN
5749 le = bo < 0;
5750#else
5751 le = bo <= 0;
5752#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005753 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005754
Victor Stinner8f674cc2013-04-17 23:02:17 +02005755 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005756 writer.min_length = (e - q + 3) / 4;
5757 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005758 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005759
Victor Stinnere64322e2012-10-30 23:12:47 +01005760 while (1) {
5761 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005762 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005763
Victor Stinnere64322e2012-10-30 23:12:47 +01005764 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005765 enum PyUnicode_Kind kind = writer.kind;
5766 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005767 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005768 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005769 if (le) {
5770 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005771 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005772 if (ch > maxch)
5773 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005774 if (kind != PyUnicode_1BYTE_KIND &&
5775 Py_UNICODE_IS_SURROGATE(ch))
5776 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005777 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005778 q += 4;
5779 } while (q <= last);
5780 }
5781 else {
5782 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005783 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005784 if (ch > maxch)
5785 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005786 if (kind != PyUnicode_1BYTE_KIND &&
5787 Py_UNICODE_IS_SURROGATE(ch))
5788 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005789 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005790 q += 4;
5791 } while (q <= last);
5792 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005793 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005794 }
5795
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005796 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005797 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005798 startinpos = ((const char *)q) - starts;
5799 endinpos = startinpos + 4;
5800 }
5801 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005802 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005803 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005804 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005805 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005806 startinpos = ((const char *)q) - starts;
5807 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005808 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005809 else {
5810 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005811 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005812 goto onError;
5813 q += 4;
5814 continue;
5815 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005816 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005817 startinpos = ((const char *)q) - starts;
5818 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005819 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005820
5821 /* The remaining input chars are ignored if the callback
5822 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005823 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005825 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005826 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005827 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005829 }
5830
Walter Dörwald41980ca2007-08-16 21:55:45 +00005831 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005833
Walter Dörwald41980ca2007-08-16 21:55:45 +00005834 Py_XDECREF(errorHandler);
5835 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005836 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005837
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005839 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005840 Py_XDECREF(errorHandler);
5841 Py_XDECREF(exc);
5842 return NULL;
5843}
5844
5845PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005846_PyUnicode_EncodeUTF32(PyObject *str,
5847 const char *errors,
5848 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005849{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005850 enum PyUnicode_Kind kind;
5851 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005852 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005853 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005854 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005855#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005856 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005857#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005858 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005859#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005860 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005861 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005862 PyObject *errorHandler = NULL;
5863 PyObject *exc = NULL;
5864 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005865
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005866 if (!PyUnicode_Check(str)) {
5867 PyErr_BadArgument();
5868 return NULL;
5869 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005870 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005871 return NULL;
5872 kind = PyUnicode_KIND(str);
5873 data = PyUnicode_DATA(str);
5874 len = PyUnicode_GET_LENGTH(str);
5875
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005876 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005877 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005878 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005879 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005880 if (v == NULL)
5881 return NULL;
5882
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005883 /* output buffer is 4-bytes aligned */
5884 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005885 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005886 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005887 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005888 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005889 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005890
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005891 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005892 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005893 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005894 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005895 else
5896 encoding = "utf-32";
5897
5898 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005899 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5900 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005901 }
5902
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005903 pos = 0;
5904 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005905 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005906
5907 if (kind == PyUnicode_2BYTE_KIND) {
5908 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5909 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005910 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005911 else {
5912 assert(kind == PyUnicode_4BYTE_KIND);
5913 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5914 &out, native_ordering);
5915 }
5916 if (pos == len)
5917 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005918
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005919 rep = unicode_encode_call_errorhandler(
5920 errors, &errorHandler,
5921 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005922 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005923 if (!rep)
5924 goto error;
5925
5926 if (PyBytes_Check(rep)) {
5927 repsize = PyBytes_GET_SIZE(rep);
5928 if (repsize & 3) {
5929 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005930 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005931 "surrogates not allowed");
5932 goto error;
5933 }
5934 moreunits = repsize / 4;
5935 }
5936 else {
5937 assert(PyUnicode_Check(rep));
5938 if (PyUnicode_READY(rep) < 0)
5939 goto error;
5940 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5941 if (!PyUnicode_IS_ASCII(rep)) {
5942 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005943 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005944 "surrogates not allowed");
5945 goto error;
5946 }
5947 }
5948
5949 /* four bytes are reserved for each surrogate */
5950 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005951 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005952 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005953 /* integer overflow */
5954 PyErr_NoMemory();
5955 goto error;
5956 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005957 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005958 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005959 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005960 }
5961
5962 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005963 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005964 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005965 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005966 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005967 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5968 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005969 }
5970
5971 Py_CLEAR(rep);
5972 }
5973
5974 /* Cut back to size actually needed. This is necessary for, for example,
5975 encoding of a string containing isolated surrogates and the 'ignore'
5976 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005977 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005978 if (nsize != PyBytes_GET_SIZE(v))
5979 _PyBytes_Resize(&v, nsize);
5980 Py_XDECREF(errorHandler);
5981 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005982 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005983 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005984 error:
5985 Py_XDECREF(rep);
5986 Py_XDECREF(errorHandler);
5987 Py_XDECREF(exc);
5988 Py_XDECREF(v);
5989 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005990}
5991
Alexander Belopolsky40018472011-02-26 01:02:56 +00005992PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005993PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5994 Py_ssize_t size,
5995 const char *errors,
5996 int byteorder)
5997{
5998 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005999 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006000 if (tmp == NULL)
6001 return NULL;
6002 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
6003 Py_DECREF(tmp);
6004 return result;
6005}
6006
6007PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006008PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00006009{
Victor Stinnerb960b342011-11-20 19:12:52 +01006010 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00006011}
6012
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013/* --- UTF-16 Codec ------------------------------------------------------- */
6014
Tim Peters772747b2001-08-09 22:21:55 +00006015PyObject *
6016PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006017 Py_ssize_t size,
6018 const char *errors,
6019 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020{
Walter Dörwald69652032004-09-07 20:24:22 +00006021 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6022}
6023
6024PyObject *
6025PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 Py_ssize_t size,
6027 const char *errors,
6028 int *byteorder,
6029 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00006030{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006031 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006032 Py_ssize_t startinpos;
6033 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006034 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006035 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00006036 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006037 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00006038 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006039 PyObject *errorHandler = NULL;
6040 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006041 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042
Andy Lestere6be9b52020-02-11 20:28:35 -06006043 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006044 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045
6046 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00006047 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006049 /* Check for BOM marks (U+FEFF) in the input and adjust current
6050 byte order setting accordingly. In native mode, the leading BOM
6051 mark is skipped, in all other modes, it is copied to the output
6052 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006053 if (bo == 0 && size >= 2) {
6054 const Py_UCS4 bom = (q[1] << 8) | q[0];
6055 if (bom == 0xFEFF) {
6056 q += 2;
6057 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006058 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006059 else if (bom == 0xFFFE) {
6060 q += 2;
6061 bo = 1;
6062 }
6063 if (byteorder)
6064 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066
Antoine Pitrou63065d72012-05-15 23:48:04 +02006067 if (q == e) {
6068 if (consumed)
6069 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006070 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00006071 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006072
Christian Heimes743e0cd2012-10-17 23:52:17 +02006073#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02006074 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006075 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00006076#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02006077 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006078 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00006079#endif
Tim Peters772747b2001-08-09 22:21:55 +00006080
Antoine Pitrou63065d72012-05-15 23:48:04 +02006081 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08006082 character count normally. Error handler will take care of
6083 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006084 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006085 writer.min_length = (e - q + 1) / 2;
6086 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006087 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006088
Antoine Pitrou63065d72012-05-15 23:48:04 +02006089 while (1) {
6090 Py_UCS4 ch = 0;
6091 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006092 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006093 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006094 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02006095 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006096 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006097 native_ordering);
6098 else
6099 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006100 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006101 native_ordering);
6102 } else if (kind == PyUnicode_2BYTE_KIND) {
6103 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006104 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006105 native_ordering);
6106 } else {
6107 assert(kind == PyUnicode_4BYTE_KIND);
6108 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006109 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006110 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00006111 }
Antoine Pitrouab868312009-01-10 15:40:25 +00006112 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006113
Antoine Pitrou63065d72012-05-15 23:48:04 +02006114 switch (ch)
6115 {
6116 case 0:
6117 /* remaining byte at the end? (size should be even) */
6118 if (q == e || consumed)
6119 goto End;
6120 errmsg = "truncated data";
6121 startinpos = ((const char *)q) - starts;
6122 endinpos = ((const char *)e) - starts;
6123 break;
6124 /* The remaining input chars are ignored if the callback
6125 chooses to skip the input */
6126 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006127 q -= 2;
6128 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006129 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006130 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006131 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006132 endinpos = ((const char *)e) - starts;
6133 break;
6134 case 2:
6135 errmsg = "illegal encoding";
6136 startinpos = ((const char *)q) - 2 - starts;
6137 endinpos = startinpos + 2;
6138 break;
6139 case 3:
6140 errmsg = "illegal UTF-16 surrogate";
6141 startinpos = ((const char *)q) - 4 - starts;
6142 endinpos = startinpos + 2;
6143 break;
6144 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006145 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006146 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006147 continue;
6148 }
6149
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006150 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006151 errors,
6152 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006153 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006154 &starts,
6155 (const char **)&e,
6156 &startinpos,
6157 &endinpos,
6158 &exc,
6159 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006160 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006161 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162 }
6163
Antoine Pitrou63065d72012-05-15 23:48:04 +02006164End:
Walter Dörwald69652032004-09-07 20:24:22 +00006165 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006167
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006168 Py_XDECREF(errorHandler);
6169 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006170 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171
Benjamin Peterson29060642009-01-31 22:14:21 +00006172 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006173 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006174 Py_XDECREF(errorHandler);
6175 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006176 return NULL;
6177}
6178
Tim Peters772747b2001-08-09 22:21:55 +00006179PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006180_PyUnicode_EncodeUTF16(PyObject *str,
6181 const char *errors,
6182 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006184 enum PyUnicode_Kind kind;
6185 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006186 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006187 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006188 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006189 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006190#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006191 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006192#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006193 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006194#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006195 const char *encoding;
6196 Py_ssize_t nsize, pos;
6197 PyObject *errorHandler = NULL;
6198 PyObject *exc = NULL;
6199 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006200
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006201 if (!PyUnicode_Check(str)) {
6202 PyErr_BadArgument();
6203 return NULL;
6204 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006205 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006206 return NULL;
6207 kind = PyUnicode_KIND(str);
6208 data = PyUnicode_DATA(str);
6209 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006210
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006211 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006212 if (kind == PyUnicode_4BYTE_KIND) {
6213 const Py_UCS4 *in = (const Py_UCS4 *)data;
6214 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006215 while (in < end) {
6216 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006217 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006218 }
6219 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006220 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006221 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006222 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006223 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006224 nsize = len + pairs + (byteorder == 0);
6225 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006226 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006230 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006231 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006232 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006233 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006234 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006235 }
6236 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006237 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006238 }
Tim Peters772747b2001-08-09 22:21:55 +00006239
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006240 if (kind == PyUnicode_1BYTE_KIND) {
6241 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6242 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006243 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006244
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006245 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006246 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006247 }
6248 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006249 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006250 }
6251 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006252 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006253 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006254
6255 pos = 0;
6256 while (pos < len) {
6257 Py_ssize_t repsize, moreunits;
6258
6259 if (kind == PyUnicode_2BYTE_KIND) {
6260 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6261 &out, native_ordering);
6262 }
6263 else {
6264 assert(kind == PyUnicode_4BYTE_KIND);
6265 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6266 &out, native_ordering);
6267 }
6268 if (pos == len)
6269 break;
6270
6271 rep = unicode_encode_call_errorhandler(
6272 errors, &errorHandler,
6273 encoding, "surrogates not allowed",
6274 str, &exc, pos, pos + 1, &pos);
6275 if (!rep)
6276 goto error;
6277
6278 if (PyBytes_Check(rep)) {
6279 repsize = PyBytes_GET_SIZE(rep);
6280 if (repsize & 1) {
6281 raise_encode_exception(&exc, encoding,
6282 str, pos - 1, pos,
6283 "surrogates not allowed");
6284 goto error;
6285 }
6286 moreunits = repsize / 2;
6287 }
6288 else {
6289 assert(PyUnicode_Check(rep));
6290 if (PyUnicode_READY(rep) < 0)
6291 goto error;
6292 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6293 if (!PyUnicode_IS_ASCII(rep)) {
6294 raise_encode_exception(&exc, encoding,
6295 str, pos - 1, pos,
6296 "surrogates not allowed");
6297 goto error;
6298 }
6299 }
6300
6301 /* two bytes are reserved for each surrogate */
6302 if (moreunits > 1) {
6303 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006304 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006305 /* integer overflow */
6306 PyErr_NoMemory();
6307 goto error;
6308 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006309 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006310 goto error;
6311 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6312 }
6313
6314 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006315 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006316 out += moreunits;
6317 } else /* rep is unicode */ {
6318 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6319 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6320 &out, native_ordering);
6321 }
6322
6323 Py_CLEAR(rep);
6324 }
6325
6326 /* Cut back to size actually needed. This is necessary for, for example,
6327 encoding of a string containing isolated surrogates and the 'ignore' handler
6328 is used. */
6329 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6330 if (nsize != PyBytes_GET_SIZE(v))
6331 _PyBytes_Resize(&v, nsize);
6332 Py_XDECREF(errorHandler);
6333 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006334 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006335 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006336 error:
6337 Py_XDECREF(rep);
6338 Py_XDECREF(errorHandler);
6339 Py_XDECREF(exc);
6340 Py_XDECREF(v);
6341 return NULL;
6342#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343}
6344
Alexander Belopolsky40018472011-02-26 01:02:56 +00006345PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006346PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6347 Py_ssize_t size,
6348 const char *errors,
6349 int byteorder)
6350{
6351 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006352 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006353 if (tmp == NULL)
6354 return NULL;
6355 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6356 Py_DECREF(tmp);
6357 return result;
6358}
6359
6360PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006361PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006363 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364}
6365
6366/* --- Unicode Escape Codec ----------------------------------------------- */
6367
/* Cached pointer to the name-lookup C API exported through the
   PyUnicodeData_CAPSULE_NAME capsule.  It starts out NULL; the unicode-escape
   decoder imports the capsule the first time it meets a \N escape and
   reuses the pointer afterwards. */
Victor Stinner47e1afd2020-10-26 16:43:47 +01006368static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006369
/* Decode the `size` bytes at `s` as unicode-escape data and return a new
   str object, or NULL with an exception set.

   Every byte other than a backslash is written to the result as the code
   point with the same value.  After a backslash the decoder recognizes:
   a newline (the pair is dropped), another backslash, single and double
   quote, the letters a b f n r t v, one to three octal digits, x + 2 hex
   digits, u + 4 hex digits, U + 8 hex digits and N{name}.

   Any other character after a backslash is copied through together with
   the backslash.  *first_invalid_escape is set to point at the first such
   character, or left NULL if there was none, so the caller can decide
   whether to warn.

   A backslash at the very end of the input, a hex escape with too few
   digits, a value above MAX_UNICODE and a malformed or unknown N{...} name
   are all passed to the error handler selected by `errors`. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006370PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006371_PyUnicode_DecodeUnicodeEscape(const char *s,
6372 Py_ssize_t size,
6373 const char *errors,
6374 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006375{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006376 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006377 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006379 PyObject *errorHandler = NULL;
6380 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006381
Eric V. Smith42454af2016-10-31 09:22:08 -04006382 // so we can remember if we've seen an invalid escape char or not
6383 *first_invalid_escape = NULL;
6384
Victor Stinner62ec3312016-09-06 17:04:34 -07006385 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006386 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006387 }
6388 /* Escaped strings will always be longer than the resulting
6389 Unicode string, so we start with size here and then reduce the
6390 length after conversion to the true value.
6391 (but if the error callback returns a long replacement string
6392 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006393 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006394 writer.min_length = size;
6395 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6396 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006397 }
6398
Guido van Rossumd57fd912000-03-10 22:53:23 +00006399 end = s + size;
6400 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006401 unsigned char c = (unsigned char) *s++;
6402 Py_UCS4 ch;
6403 int count;
6404 Py_ssize_t startinpos;
6405 Py_ssize_t endinpos;
6406 const char *message;
6407
/* WRITE_ASCII_CHAR stores directly into the writer buffer and only asserts
   that room is left.  WRITE_CHAR does the same while ch fits the current
   maxchar and otherwise goes through _PyUnicodeWriter_WriteCharInline.
   Both macros are undefined again at the bottom of the loop body. */
6408#define WRITE_ASCII_CHAR(ch) \
6409 do { \
6410 assert(ch <= 127); \
6411 assert(writer.pos < writer.size); \
6412 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6413 } while(0)
6414
6415#define WRITE_CHAR(ch) \
6416 do { \
6417 if (ch <= writer.maxchar) { \
6418 assert(writer.pos < writer.size); \
6419 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6420 } \
6421 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6422 goto onError; \
6423 } \
6424 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425
6426 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006427 if (c != '\\') {
6428 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006429 continue;
6430 }
6431
/* s already points past the backslash, so startinpos is the index of the
   backslash that opened this escape. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006432 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006433 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006434 if (s >= end) {
6435 message = "\\ at end of string";
6436 goto error;
6437 }
6438 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006439
Victor Stinner62ec3312016-09-06 17:04:34 -07006440 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006441 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442
Benjamin Peterson29060642009-01-31 22:14:21 +00006443 /* \x escapes */
/* backslash followed by a newline produces no output at all */
Victor Stinner62ec3312016-09-06 17:04:34 -07006444 case '\n': continue;
6445 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6446 case '\'': WRITE_ASCII_CHAR('\''); continue;
6447 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6448 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006449 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006450 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6451 case 't': WRITE_ASCII_CHAR('\t'); continue;
6452 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6453 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006454 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006455 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006456 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006457 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458
Benjamin Peterson29060642009-01-31 22:14:21 +00006459 /* \OOO (octal) escapes */
/* The digit just read plus at most two more octal digits are folded into
   ch; the result is written without any range check. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006460 case '0': case '1': case '2': case '3':
6461 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006462 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006463 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006464 ch = (ch<<3) + *s++ - '0';
6465 if (s < end && '0' <= *s && *s <= '7') {
6466 ch = (ch<<3) + *s++ - '0';
6467 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006469 WRITE_CHAR(ch);
6470 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471
Benjamin Peterson29060642009-01-31 22:14:21 +00006472 /* hex escapes */
6473 /* \xXX */
/* Each hex case sets the number of digits required and the message used if
   the escape turns out to be too short, then shares the hexescape code. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006475 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006476 message = "truncated \\xXX escape";
6477 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006481 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006482 message = "truncated \\uXXXX escape";
6483 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006486 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006487 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006488 message = "truncated \\UXXXXXXXX escape";
6489 hexescape:
/* Consume hex digits until `count` reaches zero, the input ends, or a
   non-hex byte is met.  In the last two cases count stays non-zero. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006490 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006491 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006492 ch <<= 4;
6493 if (c >= '0' && c <= '9') {
6494 ch += c - '0';
6495 }
6496 else if (c >= 'a' && c <= 'f') {
6497 ch += c - ('a' - 10);
6498 }
6499 else if (c >= 'A' && c <= 'F') {
6500 ch += c - ('A' - 10);
6501 }
6502 else {
6503 break;
6504 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006505 }
/* too few digits: report the "truncated" message chosen above */
Victor Stinner62ec3312016-09-06 17:04:34 -07006506 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006507 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006508 }
6509
6510 /* when we get here, ch is a 32-bit unicode character */
6511 if (ch > MAX_UNICODE) {
6512 message = "illegal Unicode character";
6513 goto error;
6514 }
6515
6516 WRITE_CHAR(ch);
6517 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006518
Benjamin Peterson29060642009-01-31 22:14:21 +00006519 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006520 case 'N':
/* The capsule is imported on first use only; a failed import raises
   UnicodeError and aborts decoding without calling the error handler. */
Victor Stinner47e1afd2020-10-26 16:43:47 +01006521 if (ucnhash_capi == NULL) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006522 /* load the unicode data module */
Victor Stinner47e1afd2020-10-26 16:43:47 +01006523 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006524 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner47e1afd2020-10-26 16:43:47 +01006525 if (ucnhash_capi == NULL) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006526 PyErr_SetString(
6527 PyExc_UnicodeError,
6528 "\\N escapes not supported (can't load unicodedata module)"
6529 );
6530 goto onError;
6531 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006532 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006533
/* "malformed" covers a missing opening brace, an empty name and a missing
   closing brace; a well-formed name that the lookup rejects switches the
   message to "unknown Unicode character name" below. */
6534 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006535 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006536 const char *start = ++s;
6537 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006538 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006539 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006540 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006541 namelen = s - start;
6542 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006543 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006544 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006545 ch = 0xffffffff; /* in case 'getcode' messes up */
6546 if (namelen <= INT_MAX &&
Victor Stinner920cb642020-10-26 19:19:36 +01006547 ucnhash_capi->getcode(start, (int)namelen,
Victor Stinner62ec3312016-09-06 17:04:34 -07006548 &ch, 0)) {
6549 assert(ch <= MAX_UNICODE);
6550 WRITE_CHAR(ch);
6551 continue;
6552 }
6553 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006554 }
6555 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006556 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006557
/* Unrecognized escape: remember the first offender for the caller and copy
   the backslash and the character through unchanged. */
6558 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006559 if (*first_invalid_escape == NULL) {
6560 *first_invalid_escape = s-1; /* Back up one char, since we've
6561 already incremented s. */
6562 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006563 WRITE_ASCII_CHAR('\\');
6564 WRITE_CHAR(c);
6565 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006567
/* Shared error path.  The range from the opening backslash (startinpos) to
   the current read position is handed to the error handler; min_length is
   first reset to the unread input plus what has been written so far. */
6568 error:
6569 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006570 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006571 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006572 errors, &errorHandler,
6573 "unicodeescape", message,
6574 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006575 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006576 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006577 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006578 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006579
6580#undef WRITE_ASCII_CHAR
6581#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006583
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006584 Py_XDECREF(errorHandler);
6585 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006586 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006587
/* Failure exit: drop the partially built result and the handler state. */
Benjamin Peterson29060642009-01-31 22:14:21 +00006588 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006589 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006590 Py_XDECREF(errorHandler);
6591 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 return NULL;
6593}
6594
Eric V. Smith42454af2016-10-31 09:22:08 -04006595PyObject *
6596PyUnicode_DecodeUnicodeEscape(const char *s,
6597 Py_ssize_t size,
6598 const char *errors)
6599{
6600 const char *first_invalid_escape;
6601 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6602 &first_invalid_escape);
6603 if (result == NULL)
6604 return NULL;
6605 if (first_invalid_escape != NULL) {
6606 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6607 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006608 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006609 Py_DECREF(result);
6610 return NULL;
6611 }
6612 }
6613 return result;
6614}
6615
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006616/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617
Alexander Belopolsky40018472011-02-26 01:02:56 +00006618PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006619PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006620{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006621 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006622 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006624 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006625 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006626 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627
Ezio Melottie7f90372012-10-05 03:33:31 +03006628 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006629 escape.
6630
Ezio Melottie7f90372012-10-05 03:33:31 +03006631 For UCS1 strings it's '\xxx', 4 bytes per source character.
6632 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6633 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006634 */
6635
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006636 if (!PyUnicode_Check(unicode)) {
6637 PyErr_BadArgument();
6638 return NULL;
6639 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006640 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006641 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006642 }
Victor Stinner358af132015-10-12 22:36:57 +02006643
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006644 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006645 if (len == 0) {
6646 return PyBytes_FromStringAndSize(NULL, 0);
6647 }
6648
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006649 kind = PyUnicode_KIND(unicode);
6650 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006651 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6652 bytes, and 1 byte characters 4. */
6653 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006654 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006655 return PyErr_NoMemory();
6656 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006657 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006658 if (repr == NULL) {
6659 return NULL;
6660 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006661
Victor Stinner62ec3312016-09-06 17:04:34 -07006662 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006663 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006664 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006665
Victor Stinner62ec3312016-09-06 17:04:34 -07006666 /* U+0000-U+00ff range */
6667 if (ch < 0x100) {
6668 if (ch >= ' ' && ch < 127) {
6669 if (ch != '\\') {
6670 /* Copy printable US ASCII as-is */
6671 *p++ = (char) ch;
6672 }
6673 /* Escape backslashes */
6674 else {
6675 *p++ = '\\';
6676 *p++ = '\\';
6677 }
6678 }
Victor Stinner358af132015-10-12 22:36:57 +02006679
Victor Stinner62ec3312016-09-06 17:04:34 -07006680 /* Map special whitespace to '\t', \n', '\r' */
6681 else if (ch == '\t') {
6682 *p++ = '\\';
6683 *p++ = 't';
6684 }
6685 else if (ch == '\n') {
6686 *p++ = '\\';
6687 *p++ = 'n';
6688 }
6689 else if (ch == '\r') {
6690 *p++ = '\\';
6691 *p++ = 'r';
6692 }
6693
6694 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6695 else {
6696 *p++ = '\\';
6697 *p++ = 'x';
6698 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6699 *p++ = Py_hexdigits[ch & 0x000F];
6700 }
Tim Petersced69f82003-09-16 20:30:58 +00006701 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006702 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006703 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 *p++ = '\\';
6705 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006706 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6707 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6708 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6709 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006711 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6712 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006713
Victor Stinner62ec3312016-09-06 17:04:34 -07006714 /* Make sure that the first two digits are zero */
6715 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006716 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006717 *p++ = 'U';
6718 *p++ = '0';
6719 *p++ = '0';
6720 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6721 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6722 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6723 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6724 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6725 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006726 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006727 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728
Victor Stinner62ec3312016-09-06 17:04:34 -07006729 assert(p - PyBytes_AS_STRING(repr) > 0);
6730 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6731 return NULL;
6732 }
6733 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734}
6735
Alexander Belopolsky40018472011-02-26 01:02:56 +00006736PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006737PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6738 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006740 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006741 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006742 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006744 }
6745
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006746 result = PyUnicode_AsUnicodeEscapeString(tmp);
6747 Py_DECREF(tmp);
6748 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749}
6750
6751/* --- Raw Unicode Escape Codec ------------------------------------------- */
6752
/* Decode the `size` bytes at `s` as raw-unicode-escape data and return a
   new str object, or NULL with an exception set.

   Only a backslash followed by u (4 hex digits) or U (8 hex digits) is
   interpreted.  Every other byte is written to the result as the code
   point with the same value; this includes a backslash that is the last
   byte of the input or that is followed by anything other than u or U.

   An escape with too few hex digits, or with a value above MAX_UNICODE, is
   passed to the error handler selected by `errors`. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006753PyObject *
6754PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006755 Py_ssize_t size,
6756 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006758 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006759 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006761 PyObject *errorHandler = NULL;
6762 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006763
Victor Stinner62ec3312016-09-06 17:04:34 -07006764 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006765 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006766 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006767
Guido van Rossumd57fd912000-03-10 22:53:23 +00006768 /* Escaped strings will always be longer than the resulting
6769 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006770 length after conversion to the true value. (But decoding error
6771 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006772 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006773 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006774 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6775 goto onError;
6776 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006777
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 end = s + size;
6779 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006780 unsigned char c = (unsigned char) *s++;
6781 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006782 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006783 Py_ssize_t startinpos;
6784 Py_ssize_t endinpos;
6785 const char *message;
6786
/* WRITE_CHAR stores directly into the writer buffer while ch fits the
   current maxchar and otherwise goes through
   _PyUnicodeWriter_WriteCharInline.  It is undefined again at the bottom
   of the loop body. */
6787#define WRITE_CHAR(ch) \
6788 do { \
6789 if (ch <= writer.maxchar) { \
6790 assert(writer.pos < writer.size); \
6791 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6792 } \
6793 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6794 goto onError; \
6795 } \
6796 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797
Benjamin Peterson29060642009-01-31 22:14:21 +00006798 /* Non-escape characters are interpreted as Unicode ordinals */
/* A backslash that is the final byte of the input is also copied as is. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006799 if (c != '\\' || s >= end) {
6800 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006802 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006803
/* Look at the byte after the backslash: u and U start a hex escape,
   anything else is emitted together with the backslash. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006804 c = (unsigned char) *s++;
6805 if (c == 'u') {
6806 count = 4;
6807 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006809 else if (c == 'U') {
6810 count = 8;
6811 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006812 }
6813 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006814 assert(writer.pos < writer.size);
6815 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6816 WRITE_CHAR(c);
6817 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006818 }
/* s is now two bytes past the backslash, so startinpos is the index of the
   backslash that opened this escape. */
Victor Stinner62ec3312016-09-06 17:04:34 -07006819 startinpos = s - starts - 2;
6820
6821 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
/* Stops early at the end of input or on a non-hex byte; count is then
   still non-zero. */
6822 for (ch = 0; count && s < end; ++s, --count) {
6823 c = (unsigned char)*s;
6824 ch <<= 4;
6825 if (c >= '0' && c <= '9') {
6826 ch += c - '0';
6827 }
6828 else if (c >= 'a' && c <= 'f') {
6829 ch += c - ('a' - 10);
6830 }
6831 else if (c >= 'A' && c <= 'F') {
6832 ch += c - ('A' - 10);
6833 }
6834 else {
6835 break;
6836 }
6837 }
/* All digits present: accept the value if it is in range, otherwise fall
   through to the error handler with the "out of range" message.  With
   digits missing, the "truncated" message chosen above is used. */
6838 if (!count) {
6839 if (ch <= MAX_UNICODE) {
6840 WRITE_CHAR(ch);
6841 continue;
6842 }
6843 message = "\\Uxxxxxxxx out of range";
6844 }
6845
/* Error path: hand the range from the backslash to the current read
   position to the error handler; min_length is first reset to the unread
   input plus what has been written so far. */
6846 endinpos = s-starts;
6847 writer.min_length = end - s + writer.pos;
6848 if (unicode_decode_call_errorhandler_writer(
6849 errors, &errorHandler,
6850 "rawunicodeescape", message,
6851 &starts, &end, &startinpos, &endinpos, &exc, &s,
6852 &writer)) {
6853 goto onError;
6854 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006855 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006856
6857#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006859 Py_XDECREF(errorHandler);
6860 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006861 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006862
/* Failure exit: drop the partially built result and the handler state. */
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006864 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006865 Py_XDECREF(errorHandler);
6866 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006868
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869}
6870
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006871
Alexander Belopolsky40018472011-02-26 01:02:56 +00006872PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006873PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006874{
Victor Stinner62ec3312016-09-06 17:04:34 -07006875 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006877 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006878 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006879 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006880 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006881
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006882 if (!PyUnicode_Check(unicode)) {
6883 PyErr_BadArgument();
6884 return NULL;
6885 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006886 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006887 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006888 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006889 kind = PyUnicode_KIND(unicode);
6890 data = PyUnicode_DATA(unicode);
6891 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006892 if (kind == PyUnicode_1BYTE_KIND) {
6893 return PyBytes_FromStringAndSize(data, len);
6894 }
Victor Stinner0e368262011-11-10 20:12:49 +01006895
Victor Stinner62ec3312016-09-06 17:04:34 -07006896 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6897 bytes, and 1 byte characters 4. */
6898 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006899
Victor Stinner62ec3312016-09-06 17:04:34 -07006900 if (len > PY_SSIZE_T_MAX / expandsize) {
6901 return PyErr_NoMemory();
6902 }
6903 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6904 if (repr == NULL) {
6905 return NULL;
6906 }
6907 if (len == 0) {
6908 return repr;
6909 }
6910
6911 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006912 for (pos = 0; pos < len; pos++) {
6913 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006914
Victor Stinner62ec3312016-09-06 17:04:34 -07006915 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6916 if (ch < 0x100) {
6917 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006918 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006919 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006920 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006921 *p++ = '\\';
6922 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006923 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6924 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6925 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6926 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006927 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006928 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6929 else {
6930 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6931 *p++ = '\\';
6932 *p++ = 'U';
6933 *p++ = '0';
6934 *p++ = '0';
6935 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6936 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6937 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6938 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6939 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6940 *p++ = Py_hexdigits[ch & 15];
6941 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006942 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006943
Victor Stinner62ec3312016-09-06 17:04:34 -07006944 assert(p > PyBytes_AS_STRING(repr));
6945 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6946 return NULL;
6947 }
6948 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949}
6950
Alexander Belopolsky40018472011-02-26 01:02:56 +00006951PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006952PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6953 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006954{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006955 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006956 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006957 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006958 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006959 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6960 Py_DECREF(tmp);
6961 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962}
6963
6964/* --- Latin-1 Codec ------------------------------------------------------ */
6965
Alexander Belopolsky40018472011-02-26 01:02:56 +00006966PyObject *
6967PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006968 Py_ssize_t size,
6969 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006972 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006973}
6974
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006975/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006976static void
6977make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006978 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006979 PyObject *unicode,
6980 Py_ssize_t startpos, Py_ssize_t endpos,
6981 const char *reason)
6982{
6983 if (*exceptionObject == NULL) {
6984 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006985 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006986 encoding, unicode, startpos, endpos, reason);
6987 }
6988 else {
6989 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6990 goto onError;
6991 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6992 goto onError;
6993 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6994 goto onError;
6995 return;
6996 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006997 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006998 }
6999}
7000
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007001/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007002static void
7003raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007004 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007005 PyObject *unicode,
7006 Py_ssize_t startpos, Py_ssize_t endpos,
7007 const char *reason)
7008{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007009 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007010 encoding, unicode, startpos, endpos, reason);
7011 if (*exceptionObject != NULL)
7012 PyCodec_StrictErrors(*exceptionObject);
7013}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007014
7015/* error handling callback helper:
7016 build arguments, call the callback and check the arguments,
7017 put the result into newpos and return the replacement string, which
7018 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007019static PyObject *
7020unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007021 PyObject **errorHandler,
7022 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007023 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007024 Py_ssize_t startpos, Py_ssize_t endpos,
7025 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007026{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02007027 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007028 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007029 PyObject *restuple;
7030 PyObject *resunicode;
7031
7032 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007033 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007034 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007035 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007036 }
7037
Benjamin Petersonbac79492012-01-14 13:34:47 -05007038 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007039 return NULL;
7040 len = PyUnicode_GET_LENGTH(unicode);
7041
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007042 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007043 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007044 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007046
Petr Viktorinffd97532020-02-11 17:46:57 +01007047 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007048 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007049 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007050 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007051 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 Py_DECREF(restuple);
7053 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007054 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007055 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 &resunicode, newpos)) {
7057 Py_DECREF(restuple);
7058 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007059 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007060 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7061 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7062 Py_DECREF(restuple);
7063 return NULL;
7064 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007065 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007066 *newpos = len + *newpos;
7067 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02007068 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007069 Py_DECREF(restuple);
7070 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007071 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007072 Py_INCREF(resunicode);
7073 Py_DECREF(restuple);
7074 return resunicode;
7075}
7076
Alexander Belopolsky40018472011-02-26 01:02:56 +00007077static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007078unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007079 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02007080 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007081{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007082 /* input state */
7083 Py_ssize_t pos=0, size;
7084 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007085 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007086 /* pointer into the output */
7087 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007088 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7089 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02007090 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007091 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02007092 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007093 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007094 /* output object */
7095 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007096
Benjamin Petersonbac79492012-01-14 13:34:47 -05007097 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007098 return NULL;
7099 size = PyUnicode_GET_LENGTH(unicode);
7100 kind = PyUnicode_KIND(unicode);
7101 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007102 /* allocate enough for a simple encoding without
7103 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00007104 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00007105 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007106
7107 _PyBytesWriter_Init(&writer);
7108 str = _PyBytesWriter_Alloc(&writer, size);
7109 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00007110 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007111
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007112 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02007113 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007114
Benjamin Peterson29060642009-01-31 22:14:21 +00007115 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007116 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007117 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007118 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007119 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007120 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007121 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007122 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007123 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007124 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007125 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007127
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007128 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007129 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007130
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007131 /* Only overallocate the buffer if it's not the last write */
7132 writer.overallocate = (collend < size);
7133
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007135 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007136 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007137
7138 switch (error_handler) {
7139 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007140 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007142
7143 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007144 memset(str, '?', collend - collstart);
7145 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007146 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007147 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007148 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007149 break;
Victor Stinner50149202015-09-22 00:26:54 +02007150
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007151 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007152 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007153 writer.min_size -= (collend - collstart);
7154 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007155 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007156 if (str == NULL)
7157 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007158 pos = collend;
7159 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007160
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007161 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007162 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007163 writer.min_size -= (collend - collstart);
7164 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007165 unicode, collstart, collend);
7166 if (str == NULL)
7167 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007168 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007169 break;
Victor Stinner50149202015-09-22 00:26:54 +02007170
Victor Stinnerc3713e92015-09-29 12:32:13 +02007171 case _Py_ERROR_SURROGATEESCAPE:
7172 for (i = collstart; i < collend; ++i) {
7173 ch = PyUnicode_READ(kind, data, i);
7174 if (ch < 0xdc80 || 0xdcff < ch) {
7175 /* Not a UTF-8b surrogate */
7176 break;
7177 }
7178 *str++ = (char)(ch - 0xdc00);
7179 ++pos;
7180 }
7181 if (i >= collend)
7182 break;
7183 collstart = pos;
7184 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007185 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007186
Benjamin Peterson29060642009-01-31 22:14:21 +00007187 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007188 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7189 encoding, reason, unicode, &exc,
7190 collstart, collend, &newpos);
7191 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007192 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007193
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007194 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007195 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007196
Victor Stinner6bd525b2015-10-09 13:10:05 +02007197 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007198 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007199 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007200 PyBytes_AS_STRING(rep),
7201 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007202 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007203 else {
7204 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007205
Victor Stinner6bd525b2015-10-09 13:10:05 +02007206 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007207 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007208
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007209 if (limit == 256 ?
7210 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7211 !PyUnicode_IS_ASCII(rep))
7212 {
7213 /* Not all characters are smaller than limit */
7214 raise_encode_exception(&exc, encoding, unicode,
7215 collstart, collend, reason);
7216 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007218 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7219 str = _PyBytesWriter_WriteBytes(&writer, str,
7220 PyUnicode_DATA(rep),
7221 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007222 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007223 if (str == NULL)
7224 goto onError;
7225
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007226 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007227 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007228 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007229
7230 /* If overallocation was disabled, ensure that it was the last
7231 write. Otherwise, we missed an optimization */
7232 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007233 }
7234 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007235
Victor Stinner50149202015-09-22 00:26:54 +02007236 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007237 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007238 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007239
7240 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007241 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007242 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007243 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007244 Py_XDECREF(exc);
7245 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007246}
7247
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007248/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007249PyObject *
7250PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007251 Py_ssize_t size,
7252 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007253{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007254 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007255 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007256 if (unicode == NULL)
7257 return NULL;
7258 result = unicode_encode_ucs1(unicode, errors, 256);
7259 Py_DECREF(unicode);
7260 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007261}
7262
Alexander Belopolsky40018472011-02-26 01:02:56 +00007263PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007264_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007265{
7266 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007267 PyErr_BadArgument();
7268 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007270 if (PyUnicode_READY(unicode) == -1)
7271 return NULL;
7272 /* Fast path: if it is a one-byte string, construct
7273 bytes object directly. */
7274 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7275 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7276 PyUnicode_GET_LENGTH(unicode));
7277 /* Non-Latin-1 characters present. Defer to above function to
7278 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007279 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007280}
7281
7282PyObject*
7283PyUnicode_AsLatin1String(PyObject *unicode)
7284{
7285 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007286}
7287
7288/* --- 7-bit ASCII Codec -------------------------------------------------- */
7289
Alexander Belopolsky40018472011-02-26 01:02:56 +00007290PyObject *
7291PyUnicode_DecodeASCII(const char *s,
7292 Py_ssize_t size,
7293 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007294{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007295 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007296 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007297 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007298 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007299 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007300
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007302 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007303
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007305 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007306 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007308
Inada Naoki770847a2019-06-24 12:30:24 +09007309 // Shortcut for simple case
7310 PyObject *u = PyUnicode_New(size, 127);
7311 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007312 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007313 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007314 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007315 if (outpos == size) {
7316 return u;
7317 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007318
Inada Naoki770847a2019-06-24 12:30:24 +09007319 _PyUnicodeWriter writer;
7320 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007321 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007322
Inada Naoki770847a2019-06-24 12:30:24 +09007323 s += outpos;
7324 int kind = writer.kind;
7325 void *data = writer.data;
7326 Py_ssize_t startinpos, endinpos;
7327
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007328 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007329 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007330 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007331 PyUnicode_WRITE(kind, data, writer.pos, c);
7332 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007333 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007334 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007335 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007336
7337 /* byte outsize range 0x00..0x7f: call the error handler */
7338
7339 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007340 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007341
7342 switch (error_handler)
7343 {
7344 case _Py_ERROR_REPLACE:
7345 case _Py_ERROR_SURROGATEESCAPE:
7346 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007347 but we may switch to UCS2 at the first write */
7348 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7349 goto onError;
7350 kind = writer.kind;
7351 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007352
7353 if (error_handler == _Py_ERROR_REPLACE)
7354 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7355 else
7356 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7357 writer.pos++;
7358 ++s;
7359 break;
7360
7361 case _Py_ERROR_IGNORE:
7362 ++s;
7363 break;
7364
7365 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007366 startinpos = s-starts;
7367 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007368 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007369 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007370 "ascii", "ordinal not in range(128)",
7371 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007372 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007373 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007374 kind = writer.kind;
7375 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007377 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007378 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007379 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007380 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007381
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007383 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007384 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007385 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386 return NULL;
7387}
7388
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007389/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007390PyObject *
7391PyUnicode_EncodeASCII(const Py_UNICODE *p,
7392 Py_ssize_t size,
7393 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007394{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007395 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007396 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007397 if (unicode == NULL)
7398 return NULL;
7399 result = unicode_encode_ucs1(unicode, errors, 128);
7400 Py_DECREF(unicode);
7401 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007402}
7403
Alexander Belopolsky40018472011-02-26 01:02:56 +00007404PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007405_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007406{
7407 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007408 PyErr_BadArgument();
7409 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007411 if (PyUnicode_READY(unicode) == -1)
7412 return NULL;
7413 /* Fast path: if it is an ASCII-only string, construct bytes object
7414 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007415 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007416 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7417 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007418 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007419}
7420
7421PyObject *
7422PyUnicode_AsASCIIString(PyObject *unicode)
7423{
7424 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007425}
7426
Steve Dowercc16be82016-09-08 10:35:16 -07007427#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007428
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007429/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007430
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007431#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007432#define NEED_RETRY
7433#endif
7434
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7439#define DECODING_CHUNK_SIZE (INT_MAX/4)
7440
Victor Stinner3a50e702011-10-18 21:21:00 +02007441#ifndef WC_ERR_INVALID_CHARS
7442# define WC_ERR_INVALID_CHARS 0x0080
7443#endif
7444
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007445static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007446code_page_name(UINT code_page, PyObject **obj)
7447{
7448 *obj = NULL;
7449 if (code_page == CP_ACP)
7450 return "mbcs";
7451 if (code_page == CP_UTF7)
7452 return "CP_UTF7";
7453 if (code_page == CP_UTF8)
7454 return "CP_UTF8";
7455
7456 *obj = PyBytes_FromFormat("cp%u", code_page);
7457 if (*obj == NULL)
7458 return NULL;
7459 return PyBytes_AS_STRING(*obj);
7460}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007461
Victor Stinner3a50e702011-10-18 21:21:00 +02007462static DWORD
7463decode_code_page_flags(UINT code_page)
7464{
7465 if (code_page == CP_UTF7) {
7466 /* The CP_UTF7 decoder only supports flags=0 */
7467 return 0;
7468 }
7469 else
7470 return MB_ERR_INVALID_CHARS;
7471}
7472
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007473/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007474 * Decode a byte string from a Windows code page into unicode object in strict
7475 * mode.
7476 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007477 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7478 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007479 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007480static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007481decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007482 wchar_t **buf,
7483 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 const char *in,
7485 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007486{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007487 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007488 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007489 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007490
7491 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007493 while ((outsize = MultiByteToWideChar(code_page, flags,
7494 in, insize, NULL, 0)) <= 0)
7495 {
7496 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7497 goto error;
7498 }
7499 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7500 flags = 0;
7501 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007502
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007503 /* Extend a wchar_t* buffer */
7504 Py_ssize_t n = *bufsize; /* Get the current length */
7505 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7506 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007507 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007508 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007509
7510 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7512 if (outsize <= 0)
7513 goto error;
7514 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007515
Victor Stinner3a50e702011-10-18 21:21:00 +02007516error:
7517 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7518 return -2;
7519 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007520 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007521}
7522
Victor Stinner3a50e702011-10-18 21:21:00 +02007523/*
7524 * Decode a byte string from a code page into unicode object with an error
7525 * handler.
7526 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007527 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007528 * UnicodeDecodeError exception and returns -1 on error.
7529 */
7530static int
7531decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007532 wchar_t **buf,
7533 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007534 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007535 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007536{
7537 const char *startin = in;
7538 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007539 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007540 /* Ideally, we should get reason from FormatMessage. This is the Windows
7541 2000 English version of the message. */
7542 const char *reason = "No mapping for the Unicode character exists "
7543 "in the target code page.";
7544 /* each step cannot decode more than 1 character, but a character can be
7545 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007546 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007547 int insize;
7548 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007549 PyObject *errorHandler = NULL;
7550 PyObject *exc = NULL;
7551 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007552 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007553 DWORD err;
7554 int ret = -1;
7555
7556 assert(size > 0);
7557
7558 encoding = code_page_name(code_page, &encoding_obj);
7559 if (encoding == NULL)
7560 return -1;
7561
Victor Stinner7d00cc12014-03-17 23:08:06 +01007562 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007563 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7564 UnicodeDecodeError. */
7565 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7566 if (exc != NULL) {
7567 PyCodec_StrictErrors(exc);
7568 Py_CLEAR(exc);
7569 }
7570 goto error;
7571 }
7572
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007573 /* Extend a wchar_t* buffer */
7574 Py_ssize_t n = *bufsize; /* Get the current length */
7575 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7576 PyErr_NoMemory();
7577 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007579 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7580 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007581 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007582 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007583
7584 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007585 while (in < endin)
7586 {
7587 /* Decode a character */
7588 insize = 1;
7589 do
7590 {
7591 outsize = MultiByteToWideChar(code_page, flags,
7592 in, insize,
7593 buffer, Py_ARRAY_LENGTH(buffer));
7594 if (outsize > 0)
7595 break;
7596 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007597 if (err == ERROR_INVALID_FLAGS && flags) {
7598 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7599 flags = 0;
7600 continue;
7601 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007602 if (err != ERROR_NO_UNICODE_TRANSLATION
7603 && err != ERROR_INSUFFICIENT_BUFFER)
7604 {
7605 PyErr_SetFromWindowsErr(0);
7606 goto error;
7607 }
7608 insize++;
7609 }
7610 /* 4=maximum length of a UTF-8 sequence */
7611 while (insize <= 4 && (in + insize) <= endin);
7612
7613 if (outsize <= 0) {
7614 Py_ssize_t startinpos, endinpos, outpos;
7615
Victor Stinner7d00cc12014-03-17 23:08:06 +01007616 /* last character in partial decode? */
7617 if (in + insize >= endin && !final)
7618 break;
7619
Victor Stinner3a50e702011-10-18 21:21:00 +02007620 startinpos = in - startin;
7621 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007622 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007623 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007624 errors, &errorHandler,
7625 encoding, reason,
7626 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007627 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007628 {
7629 goto error;
7630 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007631 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007632 }
7633 else {
7634 in += insize;
7635 memcpy(out, buffer, outsize * sizeof(wchar_t));
7636 out += outsize;
7637 }
7638 }
7639
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007640 /* Shrink the buffer */
7641 assert(out - *buf <= *bufsize);
7642 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007643 /* (in - startin) <= size and size is an int */
7644 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007645
7646error:
7647 Py_XDECREF(encoding_obj);
7648 Py_XDECREF(errorHandler);
7649 Py_XDECREF(exc);
7650 return ret;
7651}
7652
Victor Stinner3a50e702011-10-18 21:21:00 +02007653static PyObject *
7654decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007655 const char *s, Py_ssize_t size,
7656 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007657{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007658 wchar_t *buf = NULL;
7659 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007660 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007661
Victor Stinner3a50e702011-10-18 21:21:00 +02007662 if (code_page < 0) {
7663 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7664 return NULL;
7665 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007666 if (size < 0) {
7667 PyErr_BadInternalCall();
7668 return NULL;
7669 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007670
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007671 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007672 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007673
Victor Stinner76a31a62011-11-04 00:05:13 +01007674 do
7675 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007676#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007677 if (size > DECODING_CHUNK_SIZE) {
7678 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007679 final = 0;
7680 done = 0;
7681 }
7682 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007683#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007684 {
7685 chunk_size = (int)size;
7686 final = (consumed == NULL);
7687 done = 1;
7688 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007689
Victor Stinner76a31a62011-11-04 00:05:13 +01007690 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007691 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007692 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007693 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007694 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007695
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007696 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007697 s, chunk_size);
7698 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007699 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007700 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007701 errors, final);
7702 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007703
7704 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007705 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007706 return NULL;
7707 }
7708
7709 if (consumed)
7710 *consumed += converted;
7711
7712 s += converted;
7713 size -= converted;
7714 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007715
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007716 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7717 PyMem_Free(buf);
7718 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007719}
7720
Alexander Belopolsky40018472011-02-26 01:02:56 +00007721PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007722PyUnicode_DecodeCodePageStateful(int code_page,
7723 const char *s,
7724 Py_ssize_t size,
7725 const char *errors,
7726 Py_ssize_t *consumed)
7727{
7728 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7729}
7730
7731PyObject *
7732PyUnicode_DecodeMBCSStateful(const char *s,
7733 Py_ssize_t size,
7734 const char *errors,
7735 Py_ssize_t *consumed)
7736{
7737 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7738}
7739
7740PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007741PyUnicode_DecodeMBCS(const char *s,
7742 Py_ssize_t size,
7743 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007744{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007745 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7746}
7747
Victor Stinner3a50e702011-10-18 21:21:00 +02007748static DWORD
7749encode_code_page_flags(UINT code_page, const char *errors)
7750{
7751 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007752 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007753 }
7754 else if (code_page == CP_UTF7) {
7755 /* CP_UTF7 only supports flags=0 */
7756 return 0;
7757 }
7758 else {
7759 if (errors != NULL && strcmp(errors, "replace") == 0)
7760 return 0;
7761 else
7762 return WC_NO_BEST_FIT_CHARS;
7763 }
7764}
7765
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007766/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007767 * Encode a Unicode string to a Windows code page into a byte string in strict
7768 * mode.
7769 *
7770 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007771 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007772 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007773static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007774encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007775 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007776 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007777{
Victor Stinner554f3f02010-06-16 23:33:54 +00007778 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007779 BOOL *pusedDefaultChar = &usedDefaultChar;
7780 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007781 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007782 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007783 const DWORD flags = encode_code_page_flags(code_page, NULL);
7784 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007785 /* Create a substring so that we can get the UTF-16 representation
7786 of just the slice under consideration. */
7787 PyObject *substring;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007788 int ret = -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007789
Martin v. Löwis3d325192011-11-04 18:23:06 +01007790 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007791
Victor Stinner3a50e702011-10-18 21:21:00 +02007792 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007793 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007794 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007795 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007796
Victor Stinner2fc507f2011-11-04 20:06:39 +01007797 substring = PyUnicode_Substring(unicode, offset, offset+len);
7798 if (substring == NULL)
7799 return -1;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007800#if USE_UNICODE_WCHAR_CACHE
7801_Py_COMP_DIAG_PUSH
7802_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner2fc507f2011-11-04 20:06:39 +01007803 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7804 if (p == NULL) {
7805 Py_DECREF(substring);
7806 return -1;
7807 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007808_Py_COMP_DIAG_POP
7809#else /* USE_UNICODE_WCHAR_CACHE */
7810 p = PyUnicode_AsWideCharString(substring, &size);
7811 Py_CLEAR(substring);
7812 if (p == NULL) {
7813 return -1;
7814 }
7815#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinner9f067f42013-06-05 00:21:31 +02007816 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007817
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007818 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007819 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007820 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007821 NULL, 0,
7822 NULL, pusedDefaultChar);
7823 if (outsize <= 0)
7824 goto error;
7825 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007826 if (pusedDefaultChar && *pusedDefaultChar) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007827 ret = -2;
7828 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007829 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007830
Victor Stinner3a50e702011-10-18 21:21:00 +02007831 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007832 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007833 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007834 if (*outbytes == NULL) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007835 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007836 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007837 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007838 }
7839 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007840 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007841 const Py_ssize_t n = PyBytes_Size(*outbytes);
7842 if (outsize > PY_SSIZE_T_MAX - n) {
7843 PyErr_NoMemory();
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007844 goto done;
Victor Stinner3a50e702011-10-18 21:21:00 +02007845 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007846 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007847 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007848 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007849 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007850 }
7851
7852 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007853 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007854 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007855 out, outsize,
7856 NULL, pusedDefaultChar);
7857 if (outsize <= 0)
7858 goto error;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007859 if (pusedDefaultChar && *pusedDefaultChar) {
7860 ret = -2;
7861 goto done;
7862 }
7863 ret = 0;
7864
7865done:
7866#if USE_UNICODE_WCHAR_CACHE
7867 Py_DECREF(substring);
7868#else /* USE_UNICODE_WCHAR_CACHE */
7869 PyMem_Free(p);
7870#endif /* USE_UNICODE_WCHAR_CACHE */
7871 return ret;
Victor Stinner554f3f02010-06-16 23:33:54 +00007872
Victor Stinner3a50e702011-10-18 21:21:00 +02007873error:
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007874 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7875 ret = -2;
7876 goto done;
7877 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007878 PyErr_SetFromWindowsErr(0);
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007879 goto done;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007880}
7881
Victor Stinner3a50e702011-10-18 21:21:00 +02007882/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007883 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007884 * error handler.
7885 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007886 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007887 * -1 on other error.
7888 */
7889static int
7890encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007891 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007892 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007893{
Victor Stinner3a50e702011-10-18 21:21:00 +02007894 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007895 Py_ssize_t pos = unicode_offset;
7896 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007897 /* Ideally, we should get reason from FormatMessage. This is the Windows
7898 2000 English version of the message. */
7899 const char *reason = "invalid character";
7900 /* 4=maximum length of a UTF-8 sequence */
7901 char buffer[4];
7902 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7903 Py_ssize_t outsize;
7904 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007905 PyObject *errorHandler = NULL;
7906 PyObject *exc = NULL;
7907 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007908 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007909 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007910 PyObject *rep;
7911 int ret = -1;
7912
7913 assert(insize > 0);
7914
7915 encoding = code_page_name(code_page, &encoding_obj);
7916 if (encoding == NULL)
7917 return -1;
7918
7919 if (errors == NULL || strcmp(errors, "strict") == 0) {
7920 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7921 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007922 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007923 if (exc != NULL) {
7924 PyCodec_StrictErrors(exc);
7925 Py_DECREF(exc);
7926 }
7927 Py_XDECREF(encoding_obj);
7928 return -1;
7929 }
7930
7931 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7932 pusedDefaultChar = &usedDefaultChar;
7933 else
7934 pusedDefaultChar = NULL;
7935
7936 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7937 PyErr_NoMemory();
7938 goto error;
7939 }
7940 outsize = insize * Py_ARRAY_LENGTH(buffer);
7941
7942 if (*outbytes == NULL) {
7943 /* Create string object */
7944 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7945 if (*outbytes == NULL)
7946 goto error;
7947 out = PyBytes_AS_STRING(*outbytes);
7948 }
7949 else {
7950 /* Extend string object */
7951 Py_ssize_t n = PyBytes_Size(*outbytes);
7952 if (n > PY_SSIZE_T_MAX - outsize) {
7953 PyErr_NoMemory();
7954 goto error;
7955 }
7956 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7957 goto error;
7958 out = PyBytes_AS_STRING(*outbytes) + n;
7959 }
7960
7961 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007962 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007963 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007964 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7965 wchar_t chars[2];
7966 int charsize;
7967 if (ch < 0x10000) {
7968 chars[0] = (wchar_t)ch;
7969 charsize = 1;
7970 }
7971 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007972 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7973 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007974 charsize = 2;
7975 }
7976
Victor Stinner3a50e702011-10-18 21:21:00 +02007977 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007978 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007979 buffer, Py_ARRAY_LENGTH(buffer),
7980 NULL, pusedDefaultChar);
7981 if (outsize > 0) {
7982 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7983 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007984 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007985 memcpy(out, buffer, outsize);
7986 out += outsize;
7987 continue;
7988 }
7989 }
7990 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7991 PyErr_SetFromWindowsErr(0);
7992 goto error;
7993 }
7994
Victor Stinner3a50e702011-10-18 21:21:00 +02007995 rep = unicode_encode_call_errorhandler(
7996 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007997 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007998 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007999 if (rep == NULL)
8000 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008001 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02008002
8003 if (PyBytes_Check(rep)) {
8004 outsize = PyBytes_GET_SIZE(rep);
8005 if (outsize != 1) {
8006 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8007 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8008 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8009 Py_DECREF(rep);
8010 goto error;
8011 }
8012 out = PyBytes_AS_STRING(*outbytes) + offset;
8013 }
8014 memcpy(out, PyBytes_AS_STRING(rep), outsize);
8015 out += outsize;
8016 }
8017 else {
8018 Py_ssize_t i;
8019 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008020 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02008021
Benjamin Petersonbac79492012-01-14 13:34:47 -05008022 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02008023 Py_DECREF(rep);
8024 goto error;
8025 }
8026
8027 outsize = PyUnicode_GET_LENGTH(rep);
8028 if (outsize != 1) {
8029 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8030 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8031 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8032 Py_DECREF(rep);
8033 goto error;
8034 }
8035 out = PyBytes_AS_STRING(*outbytes) + offset;
8036 }
8037 kind = PyUnicode_KIND(rep);
8038 data = PyUnicode_DATA(rep);
8039 for (i=0; i < outsize; i++) {
8040 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8041 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008042 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008043 encoding, unicode,
8044 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02008045 "unable to encode error handler result to ASCII");
8046 Py_DECREF(rep);
8047 goto error;
8048 }
8049 *out = (unsigned char)ch;
8050 out++;
8051 }
8052 }
8053 Py_DECREF(rep);
8054 }
8055 /* write a NUL byte */
8056 *out = 0;
8057 outsize = out - PyBytes_AS_STRING(*outbytes);
8058 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8059 if (_PyBytes_Resize(outbytes, outsize) < 0)
8060 goto error;
8061 ret = 0;
8062
8063error:
8064 Py_XDECREF(encoding_obj);
8065 Py_XDECREF(errorHandler);
8066 Py_XDECREF(exc);
8067 return ret;
8068}
8069
Victor Stinner3a50e702011-10-18 21:21:00 +02008070static PyObject *
8071encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01008072 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02008073 const char *errors)
8074{
Martin v. Löwis3d325192011-11-04 18:23:06 +01008075 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02008076 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01008077 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01008078 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01008079
Victor Stinner29dacf22015-01-26 16:41:32 +01008080 if (!PyUnicode_Check(unicode)) {
8081 PyErr_BadArgument();
8082 return NULL;
8083 }
8084
Benjamin Petersonbac79492012-01-14 13:34:47 -05008085 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01008086 return NULL;
8087 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00008088
Victor Stinner3a50e702011-10-18 21:21:00 +02008089 if (code_page < 0) {
8090 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8091 return NULL;
8092 }
8093
Martin v. Löwis3d325192011-11-04 18:23:06 +01008094 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01008095 return PyBytes_FromStringAndSize(NULL, 0);
8096
Victor Stinner7581cef2011-11-03 22:32:33 +01008097 offset = 0;
8098 do
8099 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008100#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07008101 if (len > DECODING_CHUNK_SIZE) {
8102 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01008103 done = 0;
8104 }
Victor Stinner7581cef2011-11-03 22:32:33 +01008105 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008106#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01008107 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008108 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008109 done = 1;
8110 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01008111
Victor Stinner76a31a62011-11-04 00:05:13 +01008112 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008113 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01008114 errors);
8115 if (ret == -2)
8116 ret = encode_code_page_errors(code_page, &outbytes,
8117 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008118 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01008119 if (ret < 0) {
8120 Py_XDECREF(outbytes);
8121 return NULL;
8122 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008123
Victor Stinner7581cef2011-11-03 22:32:33 +01008124 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008125 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008126 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008127
Victor Stinner3a50e702011-10-18 21:21:00 +02008128 return outbytes;
8129}
8130
8131PyObject *
8132PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8133 Py_ssize_t size,
8134 const char *errors)
8135{
Victor Stinner7581cef2011-11-03 22:32:33 +01008136 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008137 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008138 if (unicode == NULL)
8139 return NULL;
8140 res = encode_code_page(CP_ACP, unicode, errors);
8141 Py_DECREF(unicode);
8142 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008143}
8144
8145PyObject *
8146PyUnicode_EncodeCodePage(int code_page,
8147 PyObject *unicode,
8148 const char *errors)
8149{
Victor Stinner7581cef2011-11-03 22:32:33 +01008150 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008151}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008152
Alexander Belopolsky40018472011-02-26 01:02:56 +00008153PyObject *
8154PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008155{
Victor Stinner7581cef2011-11-03 22:32:33 +01008156 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008157}
8158
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008159#undef NEED_RETRY
8160
Steve Dowercc16be82016-09-08 10:35:16 -07008161#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008162
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163/* --- Character Mapping Codec -------------------------------------------- */
8164
Victor Stinnerfb161b12013-04-18 01:44:27 +02008165static int
8166charmap_decode_string(const char *s,
8167 Py_ssize_t size,
8168 PyObject *mapping,
8169 const char *errors,
8170 _PyUnicodeWriter *writer)
8171{
8172 const char *starts = s;
8173 const char *e;
8174 Py_ssize_t startinpos, endinpos;
8175 PyObject *errorHandler = NULL, *exc = NULL;
8176 Py_ssize_t maplen;
8177 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008178 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008179 Py_UCS4 x;
8180 unsigned char ch;
8181
8182 if (PyUnicode_READY(mapping) == -1)
8183 return -1;
8184
8185 maplen = PyUnicode_GET_LENGTH(mapping);
8186 mapdata = PyUnicode_DATA(mapping);
8187 mapkind = PyUnicode_KIND(mapping);
8188
8189 e = s + size;
8190
8191 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8192 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8193 * is disabled in encoding aliases, latin1 is preferred because
8194 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008195 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008196 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8197 Py_UCS4 maxchar = writer->maxchar;
8198
8199 assert (writer->kind == PyUnicode_1BYTE_KIND);
8200 while (s < e) {
8201 ch = *s;
8202 x = mapdata_ucs1[ch];
8203 if (x > maxchar) {
8204 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8205 goto onError;
8206 maxchar = writer->maxchar;
8207 outdata = (Py_UCS1 *)writer->data;
8208 }
8209 outdata[writer->pos] = x;
8210 writer->pos++;
8211 ++s;
8212 }
8213 return 0;
8214 }
8215
8216 while (s < e) {
8217 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8218 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008219 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008220 if (outkind == PyUnicode_1BYTE_KIND) {
8221 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8222 Py_UCS4 maxchar = writer->maxchar;
8223 while (s < e) {
8224 ch = *s;
8225 x = mapdata_ucs2[ch];
8226 if (x > maxchar)
8227 goto Error;
8228 outdata[writer->pos] = x;
8229 writer->pos++;
8230 ++s;
8231 }
8232 break;
8233 }
8234 else if (outkind == PyUnicode_2BYTE_KIND) {
8235 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8236 while (s < e) {
8237 ch = *s;
8238 x = mapdata_ucs2[ch];
8239 if (x == 0xFFFE)
8240 goto Error;
8241 outdata[writer->pos] = x;
8242 writer->pos++;
8243 ++s;
8244 }
8245 break;
8246 }
8247 }
8248 ch = *s;
8249
8250 if (ch < maplen)
8251 x = PyUnicode_READ(mapkind, mapdata, ch);
8252 else
8253 x = 0xfffe; /* invalid value */
8254Error:
8255 if (x == 0xfffe)
8256 {
8257 /* undefined mapping */
8258 startinpos = s-starts;
8259 endinpos = startinpos+1;
8260 if (unicode_decode_call_errorhandler_writer(
8261 errors, &errorHandler,
8262 "charmap", "character maps to <undefined>",
8263 &starts, &e, &startinpos, &endinpos, &exc, &s,
8264 writer)) {
8265 goto onError;
8266 }
8267 continue;
8268 }
8269
8270 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8271 goto onError;
8272 ++s;
8273 }
8274 Py_XDECREF(errorHandler);
8275 Py_XDECREF(exc);
8276 return 0;
8277
8278onError:
8279 Py_XDECREF(errorHandler);
8280 Py_XDECREF(exc);
8281 return -1;
8282}
8283
8284static int
8285charmap_decode_mapping(const char *s,
8286 Py_ssize_t size,
8287 PyObject *mapping,
8288 const char *errors,
8289 _PyUnicodeWriter *writer)
8290{
8291 const char *starts = s;
8292 const char *e;
8293 Py_ssize_t startinpos, endinpos;
8294 PyObject *errorHandler = NULL, *exc = NULL;
8295 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008296 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008297
8298 e = s + size;
8299
8300 while (s < e) {
8301 ch = *s;
8302
8303 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8304 key = PyLong_FromLong((long)ch);
8305 if (key == NULL)
8306 goto onError;
8307
8308 item = PyObject_GetItem(mapping, key);
8309 Py_DECREF(key);
8310 if (item == NULL) {
8311 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8312 /* No mapping found means: mapping is undefined. */
8313 PyErr_Clear();
8314 goto Undefined;
8315 } else
8316 goto onError;
8317 }
8318
8319 /* Apply mapping */
8320 if (item == Py_None)
8321 goto Undefined;
8322 if (PyLong_Check(item)) {
8323 long value = PyLong_AS_LONG(item);
8324 if (value == 0xFFFE)
8325 goto Undefined;
8326 if (value < 0 || value > MAX_UNICODE) {
8327 PyErr_Format(PyExc_TypeError,
Max Bernstein36353882020-10-17 13:38:21 -07008328 "character mapping must be in range(0x%x)",
Victor Stinnerfb161b12013-04-18 01:44:27 +02008329 (unsigned long)MAX_UNICODE + 1);
8330 goto onError;
8331 }
8332
8333 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8334 goto onError;
8335 }
8336 else if (PyUnicode_Check(item)) {
8337 if (PyUnicode_READY(item) == -1)
8338 goto onError;
8339 if (PyUnicode_GET_LENGTH(item) == 1) {
8340 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8341 if (value == 0xFFFE)
8342 goto Undefined;
8343 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8344 goto onError;
8345 }
8346 else {
8347 writer->overallocate = 1;
8348 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8349 goto onError;
8350 }
8351 }
8352 else {
8353 /* wrong return value */
8354 PyErr_SetString(PyExc_TypeError,
8355 "character mapping must return integer, None or str");
8356 goto onError;
8357 }
8358 Py_CLEAR(item);
8359 ++s;
8360 continue;
8361
8362Undefined:
8363 /* undefined mapping */
8364 Py_CLEAR(item);
8365 startinpos = s-starts;
8366 endinpos = startinpos+1;
8367 if (unicode_decode_call_errorhandler_writer(
8368 errors, &errorHandler,
8369 "charmap", "character maps to <undefined>",
8370 &starts, &e, &startinpos, &endinpos, &exc, &s,
8371 writer)) {
8372 goto onError;
8373 }
8374 }
8375 Py_XDECREF(errorHandler);
8376 Py_XDECREF(exc);
8377 return 0;
8378
8379onError:
8380 Py_XDECREF(item);
8381 Py_XDECREF(errorHandler);
8382 Py_XDECREF(exc);
8383 return -1;
8384}
8385
Alexander Belopolsky40018472011-02-26 01:02:56 +00008386PyObject *
8387PyUnicode_DecodeCharmap(const char *s,
8388 Py_ssize_t size,
8389 PyObject *mapping,
8390 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008391{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008392 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008393
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394 /* Default to Latin-1 */
8395 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008397
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008399 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008400 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008401 writer.min_length = size;
8402 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008403 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008404
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008405 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008406 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8407 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008408 }
8409 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008410 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8411 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008413 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008414
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008416 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008417 return NULL;
8418}
8419
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008420/* Charmap encoding: the lookup table */
8421
Alexander Belopolsky40018472011-02-26 01:02:56 +00008422struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008423 PyObject_HEAD
8424 unsigned char level1[32];
8425 int count2, count3;
8426 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008427};
8428
8429static PyObject*
8430encoding_map_size(PyObject *obj, PyObject* args)
8431{
8432 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008433 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008434 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008435}
8436
8437static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008438 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008439 PyDoc_STR("Return the size (in bytes) of this object") },
8440 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008441};
8442
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008443static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008444 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 "EncodingMap", /*tp_name*/
8446 sizeof(struct encoding_map), /*tp_basicsize*/
8447 0, /*tp_itemsize*/
8448 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008449 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008450 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 0, /*tp_getattr*/
8452 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008453 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 0, /*tp_repr*/
8455 0, /*tp_as_number*/
8456 0, /*tp_as_sequence*/
8457 0, /*tp_as_mapping*/
8458 0, /*tp_hash*/
8459 0, /*tp_call*/
8460 0, /*tp_str*/
8461 0, /*tp_getattro*/
8462 0, /*tp_setattro*/
8463 0, /*tp_as_buffer*/
8464 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8465 0, /*tp_doc*/
8466 0, /*tp_traverse*/
8467 0, /*tp_clear*/
8468 0, /*tp_richcompare*/
8469 0, /*tp_weaklistoffset*/
8470 0, /*tp_iter*/
8471 0, /*tp_iternext*/
8472 encoding_map_methods, /*tp_methods*/
8473 0, /*tp_members*/
8474 0, /*tp_getset*/
8475 0, /*tp_base*/
8476 0, /*tp_dict*/
8477 0, /*tp_descr_get*/
8478 0, /*tp_descr_set*/
8479 0, /*tp_dictoffset*/
8480 0, /*tp_init*/
8481 0, /*tp_alloc*/
8482 0, /*tp_new*/
8483 0, /*tp_free*/
8484 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008485};
8486
8487PyObject*
8488PyUnicode_BuildEncodingMap(PyObject* string)
8489{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008490 PyObject *result;
8491 struct encoding_map *mresult;
8492 int i;
8493 int need_dict = 0;
8494 unsigned char level1[32];
8495 unsigned char level2[512];
8496 unsigned char *mlevel1, *mlevel2, *mlevel3;
8497 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008498 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008499 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008500 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008501 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008502
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008503 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008504 PyErr_BadArgument();
8505 return NULL;
8506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 kind = PyUnicode_KIND(string);
8508 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008509 length = PyUnicode_GET_LENGTH(string);
8510 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008511 memset(level1, 0xFF, sizeof level1);
8512 memset(level2, 0xFF, sizeof level2);
8513
8514 /* If there isn't a one-to-one mapping of NULL to \0,
8515 or if there are non-BMP characters, we need to use
8516 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008518 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008519 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008520 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008521 ch = PyUnicode_READ(kind, data, i);
8522 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008523 need_dict = 1;
8524 break;
8525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008527 /* unmapped character */
8528 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008529 l1 = ch >> 11;
8530 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008531 if (level1[l1] == 0xFF)
8532 level1[l1] = count2++;
8533 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008534 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008535 }
8536
8537 if (count2 >= 0xFF || count3 >= 0xFF)
8538 need_dict = 1;
8539
8540 if (need_dict) {
8541 PyObject *result = PyDict_New();
8542 PyObject *key, *value;
8543 if (!result)
8544 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008545 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008546 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008547 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008548 if (!key || !value)
8549 goto failed1;
8550 if (PyDict_SetItem(result, key, value) == -1)
8551 goto failed1;
8552 Py_DECREF(key);
8553 Py_DECREF(value);
8554 }
8555 return result;
8556 failed1:
8557 Py_XDECREF(key);
8558 Py_XDECREF(value);
8559 Py_DECREF(result);
8560 return NULL;
8561 }
8562
8563 /* Create a three-level trie */
Victor Stinner32bd68c2020-12-01 10:37:39 +01008564 result = PyObject_Malloc(sizeof(struct encoding_map) +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008565 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008566 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008567 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008568 }
8569
8570 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008571 mresult = (struct encoding_map*)result;
8572 mresult->count2 = count2;
8573 mresult->count3 = count3;
8574 mlevel1 = mresult->level1;
8575 mlevel2 = mresult->level23;
8576 mlevel3 = mresult->level23 + 16*count2;
8577 memcpy(mlevel1, level1, 32);
8578 memset(mlevel2, 0xFF, 16*count2);
8579 memset(mlevel3, 0, 128*count3);
8580 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008581 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008582 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008583 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8584 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008585 /* unmapped character */
8586 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008587 o1 = ch>>11;
8588 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008589 i2 = 16*mlevel1[o1] + o2;
8590 if (mlevel2[i2] == 0xFF)
8591 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008592 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008593 i3 = 128*mlevel2[i2] + o3;
8594 mlevel3[i3] = i;
8595 }
8596 return result;
8597}
8598
8599static int
Victor Stinner22168992011-11-20 17:09:18 +01008600encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008601{
8602 struct encoding_map *map = (struct encoding_map*)mapping;
8603 int l1 = c>>11;
8604 int l2 = (c>>7) & 0xF;
8605 int l3 = c & 0x7F;
8606 int i;
8607
Victor Stinner22168992011-11-20 17:09:18 +01008608 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008610 if (c == 0)
8611 return 0;
8612 /* level 1*/
8613 i = map->level1[l1];
8614 if (i == 0xFF) {
8615 return -1;
8616 }
8617 /* level 2*/
8618 i = map->level23[16*i+l2];
8619 if (i == 0xFF) {
8620 return -1;
8621 }
8622 /* level 3 */
8623 i = map->level23[16*map->count2 + 128*i + l3];
8624 if (i == 0) {
8625 return -1;
8626 }
8627 return i;
8628}
8629
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008630/* Lookup the character ch in the mapping. If the character
8631 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008632 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008633static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008634charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008635{
Christian Heimes217cfd12007-12-02 14:31:20 +00008636 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008637 PyObject *x;
8638
8639 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008640 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008641 x = PyObject_GetItem(mapping, w);
8642 Py_DECREF(w);
8643 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008644 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8645 /* No mapping found means: mapping is undefined. */
8646 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008647 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008648 } else
8649 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008650 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008651 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008652 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008653 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 long value = PyLong_AS_LONG(x);
8655 if (value < 0 || value > 255) {
8656 PyErr_SetString(PyExc_TypeError,
8657 "character mapping must be in range(256)");
8658 Py_DECREF(x);
8659 return NULL;
8660 }
8661 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008662 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008663 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008666 /* wrong return value */
8667 PyErr_Format(PyExc_TypeError,
8668 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008669 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008670 Py_DECREF(x);
8671 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672 }
8673}
8674
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008675static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008676charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008677{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008678 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8679 /* exponentially overallocate to minimize reallocations */
8680 if (requiredsize < 2*outsize)
8681 requiredsize = 2*outsize;
8682 if (_PyBytes_Resize(outobj, requiredsize))
8683 return -1;
8684 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008685}
8686
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a lookup or resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008691 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692 space is available. Return a new reference to the object that
8693 was put in the output buffer, or Py_None, if the mapping was undefined
8694 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008695 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008696static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008697charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008698 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008700 PyObject *rep;
8701 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008702 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008703
Andy Lesterdffe4c02020-03-04 07:15:20 -06008704 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008705 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008707 if (res == -1)
8708 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008709 if (outsize<requiredsize)
8710 if (charmapencode_resize(outobj, outpos, requiredsize))
8711 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008712 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008713 outstart[(*outpos)++] = (char)res;
8714 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008715 }
8716
8717 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008720 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 Py_DECREF(rep);
8722 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008723 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 if (PyLong_Check(rep)) {
8725 Py_ssize_t requiredsize = *outpos+1;
8726 if (outsize<requiredsize)
8727 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8728 Py_DECREF(rep);
8729 return enc_EXCEPTION;
8730 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008731 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008733 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 else {
8735 const char *repchars = PyBytes_AS_STRING(rep);
8736 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8737 Py_ssize_t requiredsize = *outpos+repsize;
8738 if (outsize<requiredsize)
8739 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8740 Py_DECREF(rep);
8741 return enc_EXCEPTION;
8742 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008743 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 memcpy(outstart + *outpos, repchars, repsize);
8745 *outpos += repsize;
8746 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008747 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008748 Py_DECREF(rep);
8749 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008750}
8751
8752/* handle an error in PyUnicode_EncodeCharmap
8753 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008754static int
8755charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008756 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008757 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008758 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008759 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008760{
8761 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008762 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008763 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008764 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008765 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008766 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008767 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008768 Py_ssize_t collstartpos = *inpos;
8769 Py_ssize_t collendpos = *inpos+1;
8770 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008771 const char *encoding = "charmap";
8772 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008773 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008774 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008775 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008776
Benjamin Petersonbac79492012-01-14 13:34:47 -05008777 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008778 return -1;
8779 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008780 /* find all unencodable characters */
8781 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008782 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008783 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008784 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008785 val = encoding_map_lookup(ch, mapping);
8786 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 break;
8788 ++collendpos;
8789 continue;
8790 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008791
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008792 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8793 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008794 if (rep==NULL)
8795 return -1;
8796 else if (rep!=Py_None) {
8797 Py_DECREF(rep);
8798 break;
8799 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008800 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008802 }
8803 /* cache callback name lookup
8804 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008805 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008806 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008807
8808 switch (*error_handler) {
8809 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008810 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008811 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008812
8813 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008814 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008815 x = charmapencode_output('?', mapping, res, respos);
8816 if (x==enc_EXCEPTION) {
8817 return -1;
8818 }
8819 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008820 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 return -1;
8822 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008823 }
8824 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008825 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008826 *inpos = collendpos;
8827 break;
Victor Stinner50149202015-09-22 00:26:54 +02008828
8829 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008830 /* generate replacement (temporarily (mis)uses p) */
8831 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 char buffer[2+29+1+1];
8833 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008834 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008835 for (cp = buffer; *cp; ++cp) {
8836 x = charmapencode_output(*cp, mapping, res, respos);
8837 if (x==enc_EXCEPTION)
8838 return -1;
8839 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008840 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 return -1;
8842 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008843 }
8844 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008845 *inpos = collendpos;
8846 break;
Victor Stinner50149202015-09-22 00:26:54 +02008847
Benjamin Peterson14339b62009-01-31 16:36:08 +00008848 default:
Victor Stinner50149202015-09-22 00:26:54 +02008849 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008850 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008852 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008854 if (PyBytes_Check(repunicode)) {
8855 /* Directly copy bytes result to output. */
8856 Py_ssize_t outsize = PyBytes_Size(*res);
8857 Py_ssize_t requiredsize;
8858 repsize = PyBytes_Size(repunicode);
8859 requiredsize = *respos + repsize;
8860 if (requiredsize > outsize)
8861 /* Make room for all additional bytes. */
8862 if (charmapencode_resize(res, respos, requiredsize)) {
8863 Py_DECREF(repunicode);
8864 return -1;
8865 }
8866 memcpy(PyBytes_AsString(*res) + *respos,
8867 PyBytes_AsString(repunicode), repsize);
8868 *respos += repsize;
8869 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008870 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008871 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008872 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008873 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008874 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008875 Py_DECREF(repunicode);
8876 return -1;
8877 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008878 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008879 data = PyUnicode_DATA(repunicode);
8880 kind = PyUnicode_KIND(repunicode);
8881 for (index = 0; index < repsize; index++) {
8882 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8883 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008884 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008885 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 return -1;
8887 }
8888 else if (x==enc_FAILED) {
8889 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008890 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008891 return -1;
8892 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008893 }
8894 *inpos = newpos;
8895 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008896 }
8897 return 0;
8898}
8899
Alexander Belopolsky40018472011-02-26 01:02:56 +00008900PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008901_PyUnicode_EncodeCharmap(PyObject *unicode,
8902 PyObject *mapping,
8903 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008904{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008905 /* output object */
8906 PyObject *res = NULL;
8907 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008908 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008909 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008910 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008911 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008912 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008913 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008914 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008915 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008916 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008917
Benjamin Petersonbac79492012-01-14 13:34:47 -05008918 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008919 return NULL;
8920 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008921 data = PyUnicode_DATA(unicode);
8922 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008923
Guido van Rossumd57fd912000-03-10 22:53:23 +00008924 /* Default to Latin-1 */
8925 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008926 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008928 /* allocate enough for a simple encoding without
8929 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008930 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008931 if (res == NULL)
8932 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008933 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008934 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008935
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008936 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008937 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008938 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008939 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008940 if (x==enc_EXCEPTION) /* error */
8941 goto onError;
8942 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008943 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008945 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008946 &res, &respos)) {
8947 goto onError;
8948 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008949 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008950 else
8951 /* done with this character => adjust input position */
8952 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008953 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008954
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008955 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008956 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008957 if (_PyBytes_Resize(&res, respos) < 0)
8958 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008959
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008960 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008961 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008962 return res;
8963
Benjamin Peterson29060642009-01-31 22:14:21 +00008964 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008965 Py_XDECREF(res);
8966 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008967 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008968 return NULL;
8969}
8970
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008971/* Deprecated */
8972PyObject *
8973PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8974 Py_ssize_t size,
8975 PyObject *mapping,
8976 const char *errors)
8977{
8978 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008979 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008980 if (unicode == NULL)
8981 return NULL;
8982 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8983 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008984 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008985}
8986
Alexander Belopolsky40018472011-02-26 01:02:56 +00008987PyObject *
8988PyUnicode_AsCharmapString(PyObject *unicode,
8989 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008990{
8991 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008992 PyErr_BadArgument();
8993 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008994 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008995 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008996}
8997
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008998/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008999static void
9000make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009001 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009002 Py_ssize_t startpos, Py_ssize_t endpos,
9003 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009005 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009006 *exceptionObject = _PyUnicodeTranslateError_Create(
9007 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009008 }
9009 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9011 goto onError;
9012 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9013 goto onError;
9014 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9015 goto onError;
9016 return;
9017 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02009018 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009019 }
9020}
9021
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009022/* error handling callback helper:
9023 build arguments, call the callback and check the arguments,
9024 put the result into newpos and return the replacement string, which
9025 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009026static PyObject *
9027unicode_translate_call_errorhandler(const char *errors,
9028 PyObject **errorHandler,
9029 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009030 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009031 Py_ssize_t startpos, Py_ssize_t endpos,
9032 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009033{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009034 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009035
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009036 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009037 PyObject *restuple;
9038 PyObject *resunicode;
9039
9040 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009041 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009042 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009043 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009044 }
9045
9046 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009047 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009048 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009049 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009050
Petr Viktorinffd97532020-02-11 17:46:57 +01009051 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009052 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009053 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009054 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009055 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00009056 Py_DECREF(restuple);
9057 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009058 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009059 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00009060 &resunicode, &i_newpos)) {
9061 Py_DECREF(restuple);
9062 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009063 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00009064 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009065 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009066 else
9067 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02009069 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009070 Py_DECREF(restuple);
9071 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00009072 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009073 Py_INCREF(resunicode);
9074 Py_DECREF(restuple);
9075 return resunicode;
9076}
9077
9078/* Lookup the character ch in the mapping and put the result in result,
9079 which must be decrefed by the caller.
9080 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009081static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009083{
Christian Heimes217cfd12007-12-02 14:31:20 +00009084 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009085 PyObject *x;
9086
9087 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009088 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009089 x = PyObject_GetItem(mapping, w);
9090 Py_DECREF(w);
9091 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009092 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9093 /* No mapping found means: use 1:1 mapping. */
9094 PyErr_Clear();
9095 *result = NULL;
9096 return 0;
9097 } else
9098 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009099 }
9100 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009101 *result = x;
9102 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009103 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009104 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009105 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009106 if (value < 0 || value > MAX_UNICODE) {
9107 PyErr_Format(PyExc_ValueError,
9108 "character mapping must be in range(0x%x)",
9109 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00009110 Py_DECREF(x);
9111 return -1;
9112 }
9113 *result = x;
9114 return 0;
9115 }
9116 else if (PyUnicode_Check(x)) {
9117 *result = x;
9118 return 0;
9119 }
9120 else {
9121 /* wrong return value */
9122 PyErr_SetString(PyExc_TypeError,
9123 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009124 Py_DECREF(x);
9125 return -1;
9126 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009127}
Victor Stinner1194ea02014-04-04 19:37:40 +02009128
9129/* lookup the character, write the result into the writer.
9130 Return 1 if the result was written into the writer, return 0 if the mapping
9131 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009132static int
Victor Stinner1194ea02014-04-04 19:37:40 +02009133charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9134 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009135{
Victor Stinner1194ea02014-04-04 19:37:40 +02009136 PyObject *item;
9137
9138 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009139 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009140
9141 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009142 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009143 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009144 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009145 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009146 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009147 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009148
9149 if (item == Py_None) {
9150 Py_DECREF(item);
9151 return 0;
9152 }
9153
9154 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009155 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9156 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9157 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009158 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9159 Py_DECREF(item);
9160 return -1;
9161 }
9162 Py_DECREF(item);
9163 return 1;
9164 }
9165
9166 if (!PyUnicode_Check(item)) {
9167 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009168 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009169 }
9170
9171 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9172 Py_DECREF(item);
9173 return -1;
9174 }
9175
9176 Py_DECREF(item);
9177 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009178}
9179
Victor Stinner89a76ab2014-04-05 11:44:04 +02009180static int
9181unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9182 Py_UCS1 *translate)
9183{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009184 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009185 int ret = 0;
9186
Victor Stinner89a76ab2014-04-05 11:44:04 +02009187 if (charmaptranslate_lookup(ch, mapping, &item)) {
9188 return -1;
9189 }
9190
9191 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009192 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009193 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009194 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009195 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009196 /* not found => default to 1:1 mapping */
9197 translate[ch] = ch;
9198 return 1;
9199 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009200 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009201 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009202 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9203 used it */
9204 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009205 /* invalid character or character outside ASCII:
9206 skip the fast translate */
9207 goto exit;
9208 }
9209 translate[ch] = (Py_UCS1)replace;
9210 }
9211 else if (PyUnicode_Check(item)) {
9212 Py_UCS4 replace;
9213
9214 if (PyUnicode_READY(item) == -1) {
9215 Py_DECREF(item);
9216 return -1;
9217 }
9218 if (PyUnicode_GET_LENGTH(item) != 1)
9219 goto exit;
9220
9221 replace = PyUnicode_READ_CHAR(item, 0);
9222 if (replace > 127)
9223 goto exit;
9224 translate[ch] = (Py_UCS1)replace;
9225 }
9226 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009227 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009228 goto exit;
9229 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009230 ret = 1;
9231
Benjamin Peterson1365de72014-04-07 20:15:41 -04009232 exit:
9233 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009234 return ret;
9235}
9236
9237/* Fast path for ascii => ascii translation. Return 1 if the whole string
9238 was translated into writer, return 0 if the input string was partially
9239 translated into writer, raise an exception and return -1 on error. */
9240static int
9241unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009242 _PyUnicodeWriter *writer, int ignore,
9243 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009244{
Victor Stinner872b2912014-04-05 14:27:07 +02009245 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009246 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009247 const Py_UCS1 *in, *end;
9248 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009249 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009250
Victor Stinner89a76ab2014-04-05 11:44:04 +02009251 len = PyUnicode_GET_LENGTH(input);
9252
Victor Stinner872b2912014-04-05 14:27:07 +02009253 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009254
9255 in = PyUnicode_1BYTE_DATA(input);
9256 end = in + len;
9257
9258 assert(PyUnicode_IS_ASCII(writer->buffer));
9259 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9260 out = PyUnicode_1BYTE_DATA(writer->buffer);
9261
Victor Stinner872b2912014-04-05 14:27:07 +02009262 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009263 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009264 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009265 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009266 int translate = unicode_fast_translate_lookup(mapping, ch,
9267 ascii_table);
9268 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009269 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009270 if (translate == 0)
9271 goto exit;
9272 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009273 }
Victor Stinner872b2912014-04-05 14:27:07 +02009274 if (ch2 == 0xfe) {
9275 if (ignore)
9276 continue;
9277 goto exit;
9278 }
9279 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009280 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009281 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009282 }
Victor Stinner872b2912014-04-05 14:27:07 +02009283 res = 1;
9284
9285exit:
9286 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009287 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009288 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009289}
9290
Victor Stinner3222da22015-10-01 22:07:32 +02009291static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009292_PyUnicode_TranslateCharmap(PyObject *input,
9293 PyObject *mapping,
9294 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009295{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009296 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009297 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009298 Py_ssize_t size, i;
9299 int kind;
9300 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009301 _PyUnicodeWriter writer;
9302 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009303 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009304 PyObject *errorHandler = NULL;
9305 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009306 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009307 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009308
Guido van Rossumd57fd912000-03-10 22:53:23 +00009309 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009310 PyErr_BadArgument();
9311 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009312 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009313
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009314 if (PyUnicode_READY(input) == -1)
9315 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009316 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009317 kind = PyUnicode_KIND(input);
9318 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009319
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009320 if (size == 0)
9321 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009322
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009323 /* allocate enough for a simple 1:1 translation without
9324 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009325 _PyUnicodeWriter_Init(&writer);
9326 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009327 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009328
Victor Stinner872b2912014-04-05 14:27:07 +02009329 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9330
Victor Stinner33798672016-03-01 21:59:58 +01009331 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009332 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009333 if (PyUnicode_IS_ASCII(input)) {
9334 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9335 if (res < 0) {
9336 _PyUnicodeWriter_Dealloc(&writer);
9337 return NULL;
9338 }
9339 if (res == 1)
9340 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009341 }
Victor Stinner33798672016-03-01 21:59:58 +01009342 else {
9343 i = 0;
9344 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009345
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009347 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009348 int translate;
9349 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9350 Py_ssize_t newpos;
9351 /* startpos for collecting untranslatable chars */
9352 Py_ssize_t collstart;
9353 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009354 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009355
Victor Stinner1194ea02014-04-04 19:37:40 +02009356 ch = PyUnicode_READ(kind, data, i);
9357 translate = charmaptranslate_output(ch, mapping, &writer);
9358 if (translate < 0)
9359 goto onError;
9360
9361 if (translate != 0) {
9362 /* it worked => adjust input pointer */
9363 ++i;
9364 continue;
9365 }
9366
9367 /* untranslatable character */
9368 collstart = i;
9369 collend = i+1;
9370
9371 /* find all untranslatable characters */
9372 while (collend < size) {
9373 PyObject *x;
9374 ch = PyUnicode_READ(kind, data, collend);
9375 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009376 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009377 Py_XDECREF(x);
9378 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009379 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009380 ++collend;
9381 }
9382
9383 if (ignore) {
9384 i = collend;
9385 }
9386 else {
9387 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9388 reason, input, &exc,
9389 collstart, collend, &newpos);
9390 if (repunicode == NULL)
9391 goto onError;
9392 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009393 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009394 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009395 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009396 Py_DECREF(repunicode);
9397 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009398 }
9399 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009400 Py_XDECREF(exc);
9401 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009402 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009403
Benjamin Peterson29060642009-01-31 22:14:21 +00009404 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009405 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009406 Py_XDECREF(exc);
9407 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009408 return NULL;
9409}
9410
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411/* Deprecated. Use PyUnicode_Translate instead. */
9412PyObject *
9413PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9414 Py_ssize_t size,
9415 PyObject *mapping,
9416 const char *errors)
9417{
Christian Heimes5f520f42012-09-11 14:03:25 +02009418 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009419 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420 if (!unicode)
9421 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009422 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9423 Py_DECREF(unicode);
9424 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425}
9426
Alexander Belopolsky40018472011-02-26 01:02:56 +00009427PyObject *
9428PyUnicode_Translate(PyObject *str,
9429 PyObject *mapping,
9430 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009431{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009432 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009433 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009434 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009435}
Tim Petersced69f82003-09-16 20:30:58 +00009436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437PyObject *
9438_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9439{
9440 if (!PyUnicode_Check(unicode)) {
9441 PyErr_BadInternalCall();
9442 return NULL;
9443 }
9444 if (PyUnicode_READY(unicode) == -1)
9445 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009446 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 /* If the string is already ASCII, just return the same string */
9448 Py_INCREF(unicode);
9449 return unicode;
9450 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009451
9452 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9453 PyObject *result = PyUnicode_New(len, 127);
9454 if (result == NULL) {
9455 return NULL;
9456 }
9457
9458 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9459 int kind = PyUnicode_KIND(unicode);
9460 const void *data = PyUnicode_DATA(unicode);
9461 Py_ssize_t i;
9462 for (i = 0; i < len; ++i) {
9463 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9464 if (ch < 127) {
9465 out[i] = ch;
9466 }
9467 else if (Py_UNICODE_ISSPACE(ch)) {
9468 out[i] = ' ';
9469 }
9470 else {
9471 int decimal = Py_UNICODE_TODECIMAL(ch);
9472 if (decimal < 0) {
9473 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009474 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009475 _PyUnicode_LENGTH(result) = i + 1;
9476 break;
9477 }
9478 out[i] = '0' + decimal;
9479 }
9480 }
9481
INADA Naoki16dfca42018-07-14 12:06:43 +09009482 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009483 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009484}
9485
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009486PyObject *
9487PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9488 Py_ssize_t length)
9489{
Victor Stinnerf0124502011-11-21 23:12:56 +01009490 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009491 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009492 Py_UCS4 maxchar;
9493 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009494 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009495
Victor Stinner99d7ad02012-02-22 13:37:39 +01009496 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009497 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009498 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009499 if (ch > 127) {
9500 int decimal = Py_UNICODE_TODECIMAL(ch);
9501 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009502 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009503 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009504 }
9505 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009506
9507 /* Copy to a new string */
9508 decimal = PyUnicode_New(length, maxchar);
9509 if (decimal == NULL)
9510 return decimal;
9511 kind = PyUnicode_KIND(decimal);
9512 data = PyUnicode_DATA(decimal);
9513 /* Iterate over code points */
9514 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009515 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009516 if (ch > 127) {
9517 int decimal = Py_UNICODE_TODECIMAL(ch);
9518 if (decimal >= 0)
9519 ch = '0' + decimal;
9520 }
9521 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009522 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009523 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009524}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009525/* --- Decimal Encoder ---------------------------------------------------- */
9526
Alexander Belopolsky40018472011-02-26 01:02:56 +00009527int
9528PyUnicode_EncodeDecimal(Py_UNICODE *s,
9529 Py_ssize_t length,
9530 char *output,
9531 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009532{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009533 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009534 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009535 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009536 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009537
9538 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009539 PyErr_BadArgument();
9540 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009541 }
9542
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009543 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009544 if (unicode == NULL)
9545 return -1;
9546
Victor Stinner42bf7752011-11-21 22:52:58 +01009547 kind = PyUnicode_KIND(unicode);
9548 data = PyUnicode_DATA(unicode);
9549
Victor Stinnerb84d7232011-11-22 01:50:07 +01009550 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009551 PyObject *exc;
9552 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009553 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009554 Py_ssize_t startpos;
9555
9556 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009557
Benjamin Peterson29060642009-01-31 22:14:21 +00009558 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009559 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009560 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009561 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009562 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009563 decimal = Py_UNICODE_TODECIMAL(ch);
9564 if (decimal >= 0) {
9565 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009566 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009567 continue;
9568 }
9569 if (0 < ch && ch < 256) {
9570 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009571 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009572 continue;
9573 }
Victor Stinner6345be92011-11-25 20:09:01 +01009574
Victor Stinner42bf7752011-11-21 22:52:58 +01009575 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009576 exc = NULL;
9577 raise_encode_exception(&exc, "decimal", unicode,
9578 startpos, startpos+1,
9579 "invalid decimal Unicode string");
9580 Py_XDECREF(exc);
9581 Py_DECREF(unicode);
9582 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009583 }
9584 /* 0-terminate the output string */
9585 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009586 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009587 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009588}
9589
Guido van Rossumd57fd912000-03-10 22:53:23 +00009590/* --- Helpers ------------------------------------------------------------ */
9591
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len` when larger; a negative `end` or `start` is
   taken relative to `len` and clamped at 0 if still negative.  `start`
   and `end` must be modifiable lvalues; `len` may be evaluated more than
   once.

   The body is wrapped in do { } while (0) so the macro expands to a
   single statement and is safe inside an unbraced if/else.  Every use
   must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9606
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009607static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009608any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009609 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009610 Py_ssize_t end,
9611 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009613 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009614 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009615 Py_ssize_t len1, len2, result;
9616
9617 kind1 = PyUnicode_KIND(s1);
9618 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009619 if (kind1 < kind2)
9620 return -1;
9621
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 len1 = PyUnicode_GET_LENGTH(s1);
9623 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009624 ADJUST_INDICES(start, end, len1);
9625 if (end - start < len2)
9626 return -1;
9627
9628 buf1 = PyUnicode_DATA(s1);
9629 buf2 = PyUnicode_DATA(s2);
9630 if (len2 == 1) {
9631 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9632 result = findchar((const char *)buf1 + kind1*start,
9633 kind1, end - start, ch, direction);
9634 if (result == -1)
9635 return -1;
9636 else
9637 return start + result;
9638 }
9639
9640 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009641 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009642 if (!buf2)
9643 return -2;
9644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645
Victor Stinner794d5672011-10-10 03:21:36 +02009646 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009647 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009648 case PyUnicode_1BYTE_KIND:
9649 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9650 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9651 else
9652 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9653 break;
9654 case PyUnicode_2BYTE_KIND:
9655 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9656 break;
9657 case PyUnicode_4BYTE_KIND:
9658 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9659 break;
9660 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009661 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009662 }
9663 }
9664 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009665 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009666 case PyUnicode_1BYTE_KIND:
9667 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9668 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9669 else
9670 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9671 break;
9672 case PyUnicode_2BYTE_KIND:
9673 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9674 break;
9675 case PyUnicode_4BYTE_KIND:
9676 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9677 break;
9678 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009679 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009680 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009681 }
9682
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009683 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009684 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009685 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009686
9687 return result;
9688}
9689
Victor Stinner59423e32018-11-26 13:40:01 +01009690/* _PyUnicode_InsertThousandsGrouping() helper functions */
9691#include "stringlib/localeutil.h"
9692
9693/**
9694 * InsertThousandsGrouping:
9695 * @writer: Unicode writer.
9696 * @n_buffer: Number of characters in @buffer.
9697 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9698 * @d_pos: Start of digits string.
9699 * @n_digits: The number of digits in the string, in which we want
9700 * to put the grouping chars.
9701 * @min_width: The minimum width of the digits in the output string.
9702 * Output will be zero-padded on the left to fill.
9703 * @grouping: see definition in localeconv().
9704 * @thousands_sep: see definition in localeconv().
9705 *
9706 * There are 2 modes: counting and filling. If @writer is NULL,
9707 * we are in counting mode, else filling mode.
9708 * If counting, the required buffer size is returned.
9709 * If filling, we know the buffer will be large enough, so we don't
9710 * need to pass in the buffer size.
9711 * Inserts thousand grouping characters (as defined by grouping and
9712 * thousands_sep) into @writer.
9713 *
9714 * Return value: -1 on error, number of characters otherwise.
9715 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009717_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009718 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009719 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009720 PyObject *digits,
9721 Py_ssize_t d_pos,
9722 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009723 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009724 const char *grouping,
9725 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009726 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009727{
Xtreak3f7983a2019-01-07 20:39:14 +05309728 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009729 if (writer) {
9730 assert(digits != NULL);
9731 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009732 }
9733 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009734 assert(digits == NULL);
9735 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009736 }
Victor Stinner59423e32018-11-26 13:40:01 +01009737 assert(0 <= d_pos);
9738 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009739 assert(grouping != NULL);
9740
9741 if (digits != NULL) {
9742 if (PyUnicode_READY(digits) == -1) {
9743 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009744 }
Victor Stinner59423e32018-11-26 13:40:01 +01009745 }
9746 if (PyUnicode_READY(thousands_sep) == -1) {
9747 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009748 }
9749
Victor Stinner59423e32018-11-26 13:40:01 +01009750 Py_ssize_t count = 0;
9751 Py_ssize_t n_zeros;
9752 int loop_broken = 0;
9753 int use_separator = 0; /* First time through, don't append the
9754 separator. They only go between
9755 groups. */
9756 Py_ssize_t buffer_pos;
9757 Py_ssize_t digits_pos;
9758 Py_ssize_t len;
9759 Py_ssize_t n_chars;
9760 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9761 be looked at */
9762 /* A generator that returns all of the grouping widths, until it
9763 returns 0. */
9764 GroupGenerator groupgen;
9765 GroupGenerator_init(&groupgen, grouping);
9766 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9767
9768 /* if digits are not grouped, thousands separator
9769 should be an empty string */
9770 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9771
9772 digits_pos = d_pos + n_digits;
9773 if (writer) {
9774 buffer_pos = writer->pos + n_buffer;
9775 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9776 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 }
Victor Stinner59423e32018-11-26 13:40:01 +01009778 else {
9779 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009780 }
Victor Stinner59423e32018-11-26 13:40:01 +01009781
9782 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009783 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009784 }
Victor Stinner59423e32018-11-26 13:40:01 +01009785
9786 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9787 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9788 n_zeros = Py_MAX(0, len - remaining);
9789 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9790
9791 /* Use n_zero zero's and n_chars chars */
9792
9793 /* Count only, don't do anything. */
9794 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9795
9796 /* Copy into the writer. */
9797 InsertThousandsGrouping_fill(writer, &buffer_pos,
9798 digits, &digits_pos,
9799 n_chars, n_zeros,
9800 use_separator ? thousands_sep : NULL,
9801 thousands_sep_len, maxchar);
9802
9803 /* Use a separator next time. */
9804 use_separator = 1;
9805
9806 remaining -= n_chars;
9807 min_width -= len;
9808
9809 if (remaining <= 0 && min_width <= 0) {
9810 loop_broken = 1;
9811 break;
9812 }
9813 min_width -= thousands_sep_len;
9814 }
9815 if (!loop_broken) {
9816 /* We left the loop without using a break statement. */
9817
9818 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9819 n_zeros = Py_MAX(0, len - remaining);
9820 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9821
9822 /* Use n_zero zero's and n_chars chars */
9823 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9824
9825 /* Copy into the writer. */
9826 InsertThousandsGrouping_fill(writer, &buffer_pos,
9827 digits, &digits_pos,
9828 n_chars, n_zeros,
9829 use_separator ? thousands_sep : NULL,
9830 thousands_sep_len, maxchar);
9831 }
9832 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009833}
9834
9835
Alexander Belopolsky40018472011-02-26 01:02:56 +00009836Py_ssize_t
9837PyUnicode_Count(PyObject *str,
9838 PyObject *substr,
9839 Py_ssize_t start,
9840 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009841{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009842 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009843 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009844 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009845 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009846
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009847 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009848 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009849
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009850 kind1 = PyUnicode_KIND(str);
9851 kind2 = PyUnicode_KIND(substr);
9852 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009853 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009854
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009855 len1 = PyUnicode_GET_LENGTH(str);
9856 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009857 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009858 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009859 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009860
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009861 buf1 = PyUnicode_DATA(str);
9862 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009863 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009864 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009865 if (!buf2)
9866 goto onError;
9867 }
9868
9869 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009870 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009871 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009872 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009873 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009874 buf2, len2, PY_SSIZE_T_MAX
9875 );
9876 else
9877 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009878 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009879 buf2, len2, PY_SSIZE_T_MAX
9880 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009881 break;
9882 case PyUnicode_2BYTE_KIND:
9883 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009884 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009885 buf2, len2, PY_SSIZE_T_MAX
9886 );
9887 break;
9888 case PyUnicode_4BYTE_KIND:
9889 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009890 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 buf2, len2, PY_SSIZE_T_MAX
9892 );
9893 break;
9894 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009895 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009896 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009897
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009898 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009899 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009900 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901
Guido van Rossumd57fd912000-03-10 22:53:23 +00009902 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009903 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009904 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9905 if (kind2 != kind1)
9906 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009907 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009908}
9909
Alexander Belopolsky40018472011-02-26 01:02:56 +00009910Py_ssize_t
9911PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009912 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009913 Py_ssize_t start,
9914 Py_ssize_t end,
9915 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009916{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009917 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009918 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009919
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009920 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009921}
9922
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009923Py_ssize_t
9924PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9925 Py_ssize_t start, Py_ssize_t end,
9926 int direction)
9927{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009928 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009929 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009930 if (PyUnicode_READY(str) == -1)
9931 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009932 len = PyUnicode_GET_LENGTH(str);
9933 ADJUST_INDICES(start, end, len);
9934 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009935 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009936 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009937 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9938 kind, end-start, ch, direction);
9939 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009941 else
9942 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009943}
9944
Alexander Belopolsky40018472011-02-26 01:02:56 +00009945static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009946tailmatch(PyObject *self,
9947 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009948 Py_ssize_t start,
9949 Py_ssize_t end,
9950 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009951{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952 int kind_self;
9953 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009954 const void *data_self;
9955 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009956 Py_ssize_t offset;
9957 Py_ssize_t i;
9958 Py_ssize_t end_sub;
9959
9960 if (PyUnicode_READY(self) == -1 ||
9961 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009962 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009964 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9965 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009966 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009967 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009968
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009969 if (PyUnicode_GET_LENGTH(substring) == 0)
9970 return 1;
9971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972 kind_self = PyUnicode_KIND(self);
9973 data_self = PyUnicode_DATA(self);
9974 kind_sub = PyUnicode_KIND(substring);
9975 data_sub = PyUnicode_DATA(substring);
9976 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9977
9978 if (direction > 0)
9979 offset = end;
9980 else
9981 offset = start;
9982
9983 if (PyUnicode_READ(kind_self, data_self, offset) ==
9984 PyUnicode_READ(kind_sub, data_sub, 0) &&
9985 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9986 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9987 /* If both are of the same kind, memcmp is sufficient */
9988 if (kind_self == kind_sub) {
9989 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009990 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009991 data_sub,
9992 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009993 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009994 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009995 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996 else {
9997 /* We do not need to compare 0 and len(substring)-1 because
9998 the if statement above ensured already that they are equal
9999 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 for (i = 1; i < end_sub; ++i) {
10001 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
10002 PyUnicode_READ(kind_sub, data_sub, i))
10003 return 0;
10004 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010005 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010007 }
10008
10009 return 0;
10010}
10011
Alexander Belopolsky40018472011-02-26 01:02:56 +000010012Py_ssize_t
10013PyUnicode_Tailmatch(PyObject *str,
10014 PyObject *substr,
10015 Py_ssize_t start,
10016 Py_ssize_t end,
10017 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010018{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010019 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010020 return -1;
Tim Petersced69f82003-09-16 20:30:58 +000010021
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010022 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010023}
10024
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010025static PyObject *
10026ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010028 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010029 const char *data = PyUnicode_DATA(self);
10030 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010031 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +000010032
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010033 res = PyUnicode_New(len, 127);
10034 if (res == NULL)
10035 return NULL;
10036 resdata = PyUnicode_DATA(res);
10037 if (lower)
10038 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010039 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010040 _Py_bytes_upper(resdata, data, len);
10041 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010042}
10043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010045handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010046{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010047 Py_ssize_t j;
10048 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010010049 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010050 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +000010051
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010052 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10053
10054 where ! is a negation and \p{xxx} is a character with property xxx.
10055 */
10056 for (j = i - 1; j >= 0; j--) {
10057 c = PyUnicode_READ(kind, data, j);
10058 if (!_PyUnicode_IsCaseIgnorable(c))
10059 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010060 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010061 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10062 if (final_sigma) {
10063 for (j = i + 1; j < length; j++) {
10064 c = PyUnicode_READ(kind, data, j);
10065 if (!_PyUnicode_IsCaseIgnorable(c))
10066 break;
10067 }
10068 final_sigma = j == length || !_PyUnicode_IsCased(c);
10069 }
10070 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010071}
10072
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010073static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010074lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010075 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010076{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010077 /* Obscure special case. */
10078 if (c == 0x3A3) {
10079 mapped[0] = handle_capital_sigma(kind, data, length, i);
10080 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010081 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010082 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010083}
10084
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010085static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010086do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010087{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010088 Py_ssize_t i, k = 0;
10089 int n_res, j;
10090 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +000010091
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010092 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +010010093 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010094 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010095 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010096 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +000010097 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010098 for (i = 1; i < length; i++) {
10099 c = PyUnicode_READ(kind, data, i);
10100 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10101 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010102 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010103 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010104 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010105 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010106 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010107}
10108
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010109static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010110do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010111 Py_ssize_t i, k = 0;
10112
10113 for (i = 0; i < length; i++) {
10114 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10115 int n_res, j;
10116 if (Py_UNICODE_ISUPPER(c)) {
10117 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10118 }
10119 else if (Py_UNICODE_ISLOWER(c)) {
10120 n_res = _PyUnicode_ToUpperFull(c, mapped);
10121 }
10122 else {
10123 n_res = 1;
10124 mapped[0] = c;
10125 }
10126 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010127 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010128 res[k++] = mapped[j];
10129 }
10130 }
10131 return k;
10132}
10133
10134static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010135do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010136 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010137{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010138 Py_ssize_t i, k = 0;
10139
10140 for (i = 0; i < length; i++) {
10141 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10142 int n_res, j;
10143 if (lower)
10144 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10145 else
10146 n_res = _PyUnicode_ToUpperFull(c, mapped);
10147 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010148 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010149 res[k++] = mapped[j];
10150 }
10151 }
10152 return k;
10153}
10154
10155static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010156do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010157{
10158 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10159}
10160
10161static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010162do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010163{
10164 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10165}
10166
Benjamin Petersone51757f2012-01-12 21:10:29 -050010167static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010168do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010169{
10170 Py_ssize_t i, k = 0;
10171
10172 for (i = 0; i < length; i++) {
10173 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10174 Py_UCS4 mapped[3];
10175 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10176 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010177 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010178 res[k++] = mapped[j];
10179 }
10180 }
10181 return k;
10182}
10183
10184static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010185do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010186{
10187 Py_ssize_t i, k = 0;
10188 int previous_is_cased;
10189
10190 previous_is_cased = 0;
10191 for (i = 0; i < length; i++) {
10192 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10193 Py_UCS4 mapped[3];
10194 int n_res, j;
10195
10196 if (previous_is_cased)
10197 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10198 else
10199 n_res = _PyUnicode_ToTitleFull(c, mapped);
10200
10201 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010202 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010203 res[k++] = mapped[j];
10204 }
10205
10206 previous_is_cased = _PyUnicode_IsCased(c);
10207 }
10208 return k;
10209}
10210
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010211static PyObject *
10212case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010213 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010214{
10215 PyObject *res = NULL;
10216 Py_ssize_t length, newlength = 0;
10217 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010218 const void *data;
10219 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010220 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10221
Benjamin Petersoneea48462012-01-16 14:28:50 -050010222 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010223
10224 kind = PyUnicode_KIND(self);
10225 data = PyUnicode_DATA(self);
10226 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010227 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010228 PyErr_SetString(PyExc_OverflowError, "string is too long");
10229 return NULL;
10230 }
Victor Stinner00d7abd2020-12-01 09:56:42 +010010231 tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010232 if (tmp == NULL)
10233 return PyErr_NoMemory();
10234 newlength = perform(kind, data, length, tmp, &maxchar);
10235 res = PyUnicode_New(newlength, maxchar);
10236 if (res == NULL)
10237 goto leave;
10238 tmpend = tmp + newlength;
10239 outdata = PyUnicode_DATA(res);
10240 outkind = PyUnicode_KIND(res);
10241 switch (outkind) {
10242 case PyUnicode_1BYTE_KIND:
10243 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10244 break;
10245 case PyUnicode_2BYTE_KIND:
10246 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10247 break;
10248 case PyUnicode_4BYTE_KIND:
10249 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10250 break;
10251 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010252 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010253 }
10254 leave:
Victor Stinner00d7abd2020-12-01 09:56:42 +010010255 PyMem_Free(tmp);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010256 return res;
10257}
10258
Tim Peters8ce9f162004-08-27 01:49:32 +000010259PyObject *
10260PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010262 PyObject *res;
10263 PyObject *fseq;
10264 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010265 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010266
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010267 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010268 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010269 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010270 }
10271
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010272 /* NOTE: the following code can't call back into Python code,
10273 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010274 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010275
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010276 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010277 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010278 res = _PyUnicode_JoinArray(separator, items, seqlen);
10279 Py_DECREF(fseq);
10280 return res;
10281}
10282
10283PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010284_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010285{
10286 PyObject *res = NULL; /* the result */
10287 PyObject *sep = NULL;
10288 Py_ssize_t seplen;
10289 PyObject *item;
10290 Py_ssize_t sz, i, res_offset;
10291 Py_UCS4 maxchar;
10292 Py_UCS4 item_maxchar;
10293 int use_memcpy;
10294 unsigned char *res_data = NULL, *sep_data = NULL;
10295 PyObject *last_obj;
10296 unsigned int kind = 0;
10297
Tim Peters05eba1f2004-08-27 21:32:02 +000010298 /* If empty sequence, return u"". */
10299 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010300 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010301 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010302
Tim Peters05eba1f2004-08-27 21:32:02 +000010303 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010304 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010305 if (seqlen == 1) {
10306 if (PyUnicode_CheckExact(items[0])) {
10307 res = items[0];
10308 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010309 return res;
10310 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010311 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010312 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010313 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010314 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010315 /* Set up sep and seplen */
10316 if (separator == NULL) {
10317 /* fall back to a blank space separator */
10318 sep = PyUnicode_FromOrdinal(' ');
10319 if (!sep)
10320 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010321 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010322 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010323 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010324 else {
10325 if (!PyUnicode_Check(separator)) {
10326 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010327 "separator: expected str instance,"
10328 " %.80s found",
10329 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010330 goto onError;
10331 }
10332 if (PyUnicode_READY(separator))
10333 goto onError;
10334 sep = separator;
10335 seplen = PyUnicode_GET_LENGTH(separator);
10336 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10337 /* inc refcount to keep this code path symmetric with the
10338 above case of a blank separator */
10339 Py_INCREF(sep);
10340 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010341 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010342 }
10343
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010344 /* There are at least two things to join, or else we have a subclass
10345 * of str in the sequence.
10346 * Do a pre-pass to figure out the total amount of space we'll
10347 * need (sz), and see whether all argument are strings.
10348 */
10349 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010350#ifdef Py_DEBUG
10351 use_memcpy = 0;
10352#else
10353 use_memcpy = 1;
10354#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010355 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010356 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010357 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010358 if (!PyUnicode_Check(item)) {
10359 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010360 "sequence item %zd: expected str instance,"
10361 " %.80s found",
10362 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010363 goto onError;
10364 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010365 if (PyUnicode_READY(item) == -1)
10366 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010367 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010369 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010370 if (i != 0) {
10371 add_sz += seplen;
10372 }
10373 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010374 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010375 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010376 goto onError;
10377 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010378 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010379 if (use_memcpy && last_obj != NULL) {
10380 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10381 use_memcpy = 0;
10382 }
10383 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010384 }
Tim Petersced69f82003-09-16 20:30:58 +000010385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010386 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010387 if (res == NULL)
10388 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010389
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010390 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010391#ifdef Py_DEBUG
10392 use_memcpy = 0;
10393#else
10394 if (use_memcpy) {
10395 res_data = PyUnicode_1BYTE_DATA(res);
10396 kind = PyUnicode_KIND(res);
10397 if (seplen != 0)
10398 sep_data = PyUnicode_1BYTE_DATA(sep);
10399 }
10400#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010401 if (use_memcpy) {
10402 for (i = 0; i < seqlen; ++i) {
10403 Py_ssize_t itemlen;
10404 item = items[i];
10405
10406 /* Copy item, and maybe the separator. */
10407 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010408 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010409 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010410 kind * seplen);
10411 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010412 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010413
10414 itemlen = PyUnicode_GET_LENGTH(item);
10415 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010416 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010417 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010418 kind * itemlen);
10419 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010420 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010421 }
10422 assert(res_data == PyUnicode_1BYTE_DATA(res)
10423 + kind * PyUnicode_GET_LENGTH(res));
10424 }
10425 else {
10426 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10427 Py_ssize_t itemlen;
10428 item = items[i];
10429
10430 /* Copy item, and maybe the separator. */
10431 if (i && seplen != 0) {
10432 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10433 res_offset += seplen;
10434 }
10435
10436 itemlen = PyUnicode_GET_LENGTH(item);
10437 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010438 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010439 res_offset += itemlen;
10440 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010441 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010442 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010443 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010446 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010447 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010448
Benjamin Peterson29060642009-01-31 22:14:21 +000010449 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010451 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010452 return NULL;
10453}
10454
Victor Stinnerd3f08822012-05-29 12:57:52 +020010455void
10456_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10457 Py_UCS4 fill_char)
10458{
10459 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010460 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010461 assert(PyUnicode_IS_READY(unicode));
10462 assert(unicode_modifiable(unicode));
10463 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10464 assert(start >= 0);
10465 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010466 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010467}
10468
Victor Stinner3fe55312012-01-04 00:33:50 +010010469Py_ssize_t
10470PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10471 Py_UCS4 fill_char)
10472{
10473 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010474
10475 if (!PyUnicode_Check(unicode)) {
10476 PyErr_BadInternalCall();
10477 return -1;
10478 }
10479 if (PyUnicode_READY(unicode) == -1)
10480 return -1;
10481 if (unicode_check_modifiable(unicode))
10482 return -1;
10483
Victor Stinnerd3f08822012-05-29 12:57:52 +020010484 if (start < 0) {
10485 PyErr_SetString(PyExc_IndexError, "string index out of range");
10486 return -1;
10487 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010488 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10489 PyErr_SetString(PyExc_ValueError,
10490 "fill character is bigger than "
10491 "the string maximum character");
10492 return -1;
10493 }
10494
10495 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10496 length = Py_MIN(maxlen, length);
10497 if (length <= 0)
10498 return 0;
10499
Victor Stinnerd3f08822012-05-29 12:57:52 +020010500 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010501 return length;
10502}
10503
Victor Stinner9310abb2011-10-05 00:59:23 +020010504static PyObject *
10505pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010506 Py_ssize_t left,
10507 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010509{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 PyObject *u;
10511 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010512 int kind;
10513 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010514
10515 if (left < 0)
10516 left = 0;
10517 if (right < 0)
10518 right = 0;
10519
Victor Stinnerc4b49542011-12-11 22:44:26 +010010520 if (left == 0 && right == 0)
10521 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010522
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010523 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10524 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010525 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10526 return NULL;
10527 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010529 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010531 if (!u)
10532 return NULL;
10533
10534 kind = PyUnicode_KIND(u);
10535 data = PyUnicode_DATA(u);
10536 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010537 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010538 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010539 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010540 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010541 assert(_PyUnicode_CheckConsistency(u, 1));
10542 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010543}
10544
Alexander Belopolsky40018472011-02-26 01:02:56 +000010545PyObject *
10546PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010547{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010548 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010549
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010550 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010551 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010552
Benjamin Petersonead6b532011-12-20 17:23:42 -060010553 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010554 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010555 if (PyUnicode_IS_ASCII(string))
10556 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010557 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010558 PyUnicode_GET_LENGTH(string), keepends);
10559 else
10560 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010561 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010562 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 break;
10564 case PyUnicode_2BYTE_KIND:
10565 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010566 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 PyUnicode_GET_LENGTH(string), keepends);
10568 break;
10569 case PyUnicode_4BYTE_KIND:
10570 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010571 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 PyUnicode_GET_LENGTH(string), keepends);
10573 break;
10574 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010575 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010577 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578}
10579
Alexander Belopolsky40018472011-02-26 01:02:56 +000010580static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010581split(PyObject *self,
10582 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010583 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010584{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010585 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010586 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 Py_ssize_t len1, len2;
10588 PyObject* out;
10589
Guido van Rossumd57fd912000-03-10 22:53:23 +000010590 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010591 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010592
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 if (PyUnicode_READY(self) == -1)
10594 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010597 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010599 if (PyUnicode_IS_ASCII(self))
10600 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010601 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010602 PyUnicode_GET_LENGTH(self), maxcount
10603 );
10604 else
10605 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010606 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010607 PyUnicode_GET_LENGTH(self), maxcount
10608 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 case PyUnicode_2BYTE_KIND:
10610 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010611 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 PyUnicode_GET_LENGTH(self), maxcount
10613 );
10614 case PyUnicode_4BYTE_KIND:
10615 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010616 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 PyUnicode_GET_LENGTH(self), maxcount
10618 );
10619 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010620 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 }
10622
10623 if (PyUnicode_READY(substring) == -1)
10624 return NULL;
10625
10626 kind1 = PyUnicode_KIND(self);
10627 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 len1 = PyUnicode_GET_LENGTH(self);
10629 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010630 if (kind1 < kind2 || len1 < len2) {
10631 out = PyList_New(1);
10632 if (out == NULL)
10633 return NULL;
10634 Py_INCREF(self);
10635 PyList_SET_ITEM(out, 0, self);
10636 return out;
10637 }
10638 buf1 = PyUnicode_DATA(self);
10639 buf2 = PyUnicode_DATA(substring);
10640 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010641 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010642 if (!buf2)
10643 return NULL;
10644 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010645
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010646 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010648 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10649 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010650 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010651 else
10652 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010653 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 break;
10655 case PyUnicode_2BYTE_KIND:
10656 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010657 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010658 break;
10659 case PyUnicode_4BYTE_KIND:
10660 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010661 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010662 break;
10663 default:
10664 out = NULL;
10665 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010666 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010667 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010668 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010670}
10671
Alexander Belopolsky40018472011-02-26 01:02:56 +000010672static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010673rsplit(PyObject *self,
10674 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010675 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010676{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010677 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010678 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 Py_ssize_t len1, len2;
10680 PyObject* out;
10681
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010682 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010683 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 if (PyUnicode_READY(self) == -1)
10686 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010687
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010689 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010691 if (PyUnicode_IS_ASCII(self))
10692 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010693 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010694 PyUnicode_GET_LENGTH(self), maxcount
10695 );
10696 else
10697 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010698 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010699 PyUnicode_GET_LENGTH(self), maxcount
10700 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 case PyUnicode_2BYTE_KIND:
10702 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010703 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010704 PyUnicode_GET_LENGTH(self), maxcount
10705 );
10706 case PyUnicode_4BYTE_KIND:
10707 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010708 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 PyUnicode_GET_LENGTH(self), maxcount
10710 );
10711 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010712 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 }
10714
10715 if (PyUnicode_READY(substring) == -1)
10716 return NULL;
10717
10718 kind1 = PyUnicode_KIND(self);
10719 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 len1 = PyUnicode_GET_LENGTH(self);
10721 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010722 if (kind1 < kind2 || len1 < len2) {
10723 out = PyList_New(1);
10724 if (out == NULL)
10725 return NULL;
10726 Py_INCREF(self);
10727 PyList_SET_ITEM(out, 0, self);
10728 return out;
10729 }
10730 buf1 = PyUnicode_DATA(self);
10731 buf2 = PyUnicode_DATA(substring);
10732 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010733 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010734 if (!buf2)
10735 return NULL;
10736 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010737
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010738 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010740 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10741 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010742 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010743 else
10744 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010745 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746 break;
10747 case PyUnicode_2BYTE_KIND:
10748 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010749 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010750 break;
10751 case PyUnicode_4BYTE_KIND:
10752 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010753 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010754 break;
10755 default:
10756 out = NULL;
10757 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010758 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010759 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010760 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761 return out;
10762}
10763
10764static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010765anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10766 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010767{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010768 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010769 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010770 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10771 return asciilib_find(buf1, len1, buf2, len2, offset);
10772 else
10773 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 case PyUnicode_2BYTE_KIND:
10775 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10776 case PyUnicode_4BYTE_KIND:
10777 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10778 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010779 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010780}
10781
10782static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010783anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10784 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010786 switch (kind) {
10787 case PyUnicode_1BYTE_KIND:
10788 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10789 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10790 else
10791 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10792 case PyUnicode_2BYTE_KIND:
10793 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10794 case PyUnicode_4BYTE_KIND:
10795 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10796 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010797 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010798}
10799
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010800static void
10801replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10802 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10803{
10804 int kind = PyUnicode_KIND(u);
10805 void *data = PyUnicode_DATA(u);
10806 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10807 if (kind == PyUnicode_1BYTE_KIND) {
10808 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10809 (Py_UCS1 *)data + len,
10810 u1, u2, maxcount);
10811 }
10812 else if (kind == PyUnicode_2BYTE_KIND) {
10813 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10814 (Py_UCS2 *)data + len,
10815 u1, u2, maxcount);
10816 }
10817 else {
10818 assert(kind == PyUnicode_4BYTE_KIND);
10819 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10820 (Py_UCS4 *)data + len,
10821 u1, u2, maxcount);
10822 }
10823}
10824
Alexander Belopolsky40018472011-02-26 01:02:56 +000010825static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826replace(PyObject *self, PyObject *str1,
10827 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010829 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010830 const char *sbuf = PyUnicode_DATA(self);
10831 const void *buf1 = PyUnicode_DATA(str1);
10832 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010833 int srelease = 0, release1 = 0, release2 = 0;
10834 int skind = PyUnicode_KIND(self);
10835 int kind1 = PyUnicode_KIND(str1);
10836 int kind2 = PyUnicode_KIND(str2);
10837 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10838 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10839 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010840 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010841 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010843 if (slen < len1)
10844 goto nothing;
10845
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010847 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010848 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010849 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850
Victor Stinner59de0ee2011-10-07 10:01:28 +020010851 if (str1 == str2)
10852 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853
Victor Stinner49a0a212011-10-12 23:46:10 +020010854 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010855 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10856 if (maxchar < maxchar_str1)
10857 /* substring too wide to be present */
10858 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010859 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10860 /* Replacing str1 with str2 may cause a maxchar reduction in the
10861 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010862 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010863 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010864
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010865 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010866 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010867 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010868 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010870 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010871 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010872 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010873
Victor Stinner69ed0f42013-04-09 21:48:24 +020010874 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010875 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010876 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010877 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010878 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010879 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010880 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010882
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010883 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10884 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010885 }
10886 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010887 int rkind = skind;
10888 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010889 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010890
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 if (kind1 < rkind) {
10892 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010893 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010894 if (!buf1) goto error;
10895 release1 = 1;
10896 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010897 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010898 if (i < 0)
10899 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010900 if (rkind > kind2) {
10901 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010902 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010903 if (!buf2) goto error;
10904 release2 = 1;
10905 }
10906 else if (rkind < kind2) {
10907 /* widen self and buf1 */
10908 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010909 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010910 assert(buf1 != PyUnicode_DATA(str1));
10911 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010912 buf1 = PyUnicode_DATA(str1);
10913 release1 = 0;
10914 }
10915 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010916 if (!sbuf) goto error;
10917 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010918 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010919 if (!buf1) goto error;
10920 release1 = 1;
10921 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010922 u = PyUnicode_New(slen, maxchar);
10923 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010924 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010925 assert(PyUnicode_KIND(u) == rkind);
10926 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010927
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010928 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010929 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010930 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010931 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010932 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010934
10935 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010936 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010937 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010938 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010939 if (i == -1)
10940 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010941 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010943 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010944 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010945 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010946 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010947 }
10948 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010949 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010950 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 int rkind = skind;
10952 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010955 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010956 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010957 if (!buf1) goto error;
10958 release1 = 1;
10959 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010960 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010961 if (n == 0)
10962 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010964 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010965 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 if (!buf2) goto error;
10967 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010969 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010970 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010971 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010972 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010973 if (!sbuf) goto error;
10974 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010975 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010976 assert(buf1 != PyUnicode_DATA(str1));
10977 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010978 buf1 = PyUnicode_DATA(str1);
10979 release1 = 0;
10980 }
10981 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 if (!buf1) goto error;
10983 release1 = 1;
10984 }
10985 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10986 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010987 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010988 PyErr_SetString(PyExc_OverflowError,
10989 "replace string is too long");
10990 goto error;
10991 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010992 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010993 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020010994 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020010995 goto done;
10996 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010997 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 PyErr_SetString(PyExc_OverflowError,
10999 "replace string is too long");
11000 goto error;
11001 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011002 u = PyUnicode_New(new_size, maxchar);
11003 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011004 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020011005 assert(PyUnicode_KIND(u) == rkind);
11006 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007 ires = i = 0;
11008 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011009 while (n-- > 0) {
11010 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020011011 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011012 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020011013 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000011014 if (j == -1)
11015 break;
11016 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011017 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011018 memcpy(res + rkind * ires,
11019 sbuf + rkind * i,
11020 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011022 }
11023 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011024 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011025 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011026 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011027 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011028 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011029 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011031 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011032 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011033 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011034 memcpy(res + rkind * ires,
11035 sbuf + rkind * i,
11036 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020011037 }
11038 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011039 /* interleave */
11040 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011041 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011043 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011044 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011045 if (--n <= 0)
11046 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011047 memcpy(res + rkind * ires,
11048 sbuf + rkind * i,
11049 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011050 ires++;
11051 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011052 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011053 memcpy(res + rkind * ires,
11054 sbuf + rkind * i,
11055 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011056 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011057 }
11058
11059 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020011060 unicode_adjust_maxchar(&u);
11061 if (u == NULL)
11062 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011063 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011064
11065 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011066 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11067 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11068 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011069 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011070 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011072 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011073 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011074 PyMem_Free((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011075 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011076 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011077
Benjamin Peterson29060642009-01-31 22:14:21 +000011078 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000011079 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011080 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11081 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11082 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011084 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011086 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011088 PyMem_Free((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010011089 return unicode_result_unchanged(self);
11090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011092 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11093 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11094 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11095 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011096 PyMem_Free((void *)sbuf);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011097 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011098 PyMem_Free((void *)buf1);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011099 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011100 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011102}
11103
11104/* --- Unicode Object Methods --------------------------------------------- */
11105
INADA Naoki3ae20562017-01-16 20:41:20 +090011106/*[clinic input]
11107str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000011108
INADA Naoki3ae20562017-01-16 20:41:20 +090011109Return a version of the string where each word is titlecased.
11110
11111More specifically, words start with uppercased characters and all remaining
11112cased characters have lower case.
11113[clinic start generated code]*/
11114
11115static PyObject *
11116unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011117/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118{
Benjamin Petersoneea48462012-01-16 14:28:50 -050011119 if (PyUnicode_READY(self) == -1)
11120 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011121 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011122}
11123
INADA Naoki3ae20562017-01-16 20:41:20 +090011124/*[clinic input]
11125str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000011126
INADA Naoki3ae20562017-01-16 20:41:20 +090011127Return a capitalized version of the string.
11128
11129More specifically, make the first character have upper case and the rest lower
11130case.
11131[clinic start generated code]*/
11132
11133static PyObject *
11134unicode_capitalize_impl(PyObject *self)
11135/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011137 if (PyUnicode_READY(self) == -1)
11138 return NULL;
11139 if (PyUnicode_GET_LENGTH(self) == 0)
11140 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011141 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011142}
11143
INADA Naoki3ae20562017-01-16 20:41:20 +090011144/*[clinic input]
11145str.casefold as unicode_casefold
11146
11147Return a version of the string suitable for caseless comparisons.
11148[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011149
11150static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011151unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011152/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011153{
11154 if (PyUnicode_READY(self) == -1)
11155 return NULL;
11156 if (PyUnicode_IS_ASCII(self))
11157 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011158 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011159}
11160
11161
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011162/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011163
11164static int
11165convert_uc(PyObject *obj, void *addr)
11166{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011167 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011168
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011169 if (!PyUnicode_Check(obj)) {
11170 PyErr_Format(PyExc_TypeError,
11171 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011172 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011173 return 0;
11174 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011175 if (PyUnicode_READY(obj) < 0)
11176 return 0;
11177 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011178 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011179 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011180 return 0;
11181 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011182 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011183 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011184}
11185
INADA Naoki3ae20562017-01-16 20:41:20 +090011186/*[clinic input]
11187str.center as unicode_center
11188
11189 width: Py_ssize_t
11190 fillchar: Py_UCS4 = ' '
11191 /
11192
11193Return a centered string of length width.
11194
11195Padding is done using the specified fill character (default is a space).
11196[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197
11198static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011199unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11200/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011201{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011202 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011203
Benjamin Petersonbac79492012-01-14 13:34:47 -050011204 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011205 return NULL;
11206
Victor Stinnerc4b49542011-12-11 22:44:26 +010011207 if (PyUnicode_GET_LENGTH(self) >= width)
11208 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011209
Victor Stinnerc4b49542011-12-11 22:44:26 +010011210 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211 left = marg / 2 + (marg & width & 1);
11212
Victor Stinner9310abb2011-10-05 00:59:23 +020011213 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214}
11215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011216/* This function assumes that str1 and str2 are readied by the caller. */
11217
Marc-André Lemburge5034372000-08-08 08:04:29 +000011218static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011219unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011220{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011221#define COMPARE(TYPE1, TYPE2) \
11222 do { \
11223 TYPE1* p1 = (TYPE1 *)data1; \
11224 TYPE2* p2 = (TYPE2 *)data2; \
11225 TYPE1* end = p1 + len; \
11226 Py_UCS4 c1, c2; \
11227 for (; p1 != end; p1++, p2++) { \
11228 c1 = *p1; \
11229 c2 = *p2; \
11230 if (c1 != c2) \
11231 return (c1 < c2) ? -1 : 1; \
11232 } \
11233 } \
11234 while (0)
11235
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011236 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011237 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011238 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011239
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240 kind1 = PyUnicode_KIND(str1);
11241 kind2 = PyUnicode_KIND(str2);
11242 data1 = PyUnicode_DATA(str1);
11243 data2 = PyUnicode_DATA(str2);
11244 len1 = PyUnicode_GET_LENGTH(str1);
11245 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011246 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011247
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011248 switch(kind1) {
11249 case PyUnicode_1BYTE_KIND:
11250 {
11251 switch(kind2) {
11252 case PyUnicode_1BYTE_KIND:
11253 {
11254 int cmp = memcmp(data1, data2, len);
11255 /* normalize result of memcmp() into the range [-1; 1] */
11256 if (cmp < 0)
11257 return -1;
11258 if (cmp > 0)
11259 return 1;
11260 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011261 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011262 case PyUnicode_2BYTE_KIND:
11263 COMPARE(Py_UCS1, Py_UCS2);
11264 break;
11265 case PyUnicode_4BYTE_KIND:
11266 COMPARE(Py_UCS1, Py_UCS4);
11267 break;
11268 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011269 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011270 }
11271 break;
11272 }
11273 case PyUnicode_2BYTE_KIND:
11274 {
11275 switch(kind2) {
11276 case PyUnicode_1BYTE_KIND:
11277 COMPARE(Py_UCS2, Py_UCS1);
11278 break;
11279 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011280 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011281 COMPARE(Py_UCS2, Py_UCS2);
11282 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011283 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011284 case PyUnicode_4BYTE_KIND:
11285 COMPARE(Py_UCS2, Py_UCS4);
11286 break;
11287 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011288 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011289 }
11290 break;
11291 }
11292 case PyUnicode_4BYTE_KIND:
11293 {
11294 switch(kind2) {
11295 case PyUnicode_1BYTE_KIND:
11296 COMPARE(Py_UCS4, Py_UCS1);
11297 break;
11298 case PyUnicode_2BYTE_KIND:
11299 COMPARE(Py_UCS4, Py_UCS2);
11300 break;
11301 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011302 {
11303#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11304 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11305 /* normalize result of wmemcmp() into the range [-1; 1] */
11306 if (cmp < 0)
11307 return -1;
11308 if (cmp > 0)
11309 return 1;
11310#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011311 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011312#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011313 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011314 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011315 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011316 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011317 }
11318 break;
11319 }
11320 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011321 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011322 }
11323
Victor Stinner770e19e2012-10-04 22:59:45 +020011324 if (len1 == len2)
11325 return 0;
11326 if (len1 < len2)
11327 return -1;
11328 else
11329 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011330
11331#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011332}
11333
Benjamin Peterson621b4302016-09-09 13:54:34 -070011334static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011335unicode_compare_eq(PyObject *str1, PyObject *str2)
11336{
11337 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011338 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011339 Py_ssize_t len;
11340 int cmp;
11341
Victor Stinnere5567ad2012-10-23 02:48:49 +020011342 len = PyUnicode_GET_LENGTH(str1);
11343 if (PyUnicode_GET_LENGTH(str2) != len)
11344 return 0;
11345 kind = PyUnicode_KIND(str1);
11346 if (PyUnicode_KIND(str2) != kind)
11347 return 0;
11348 data1 = PyUnicode_DATA(str1);
11349 data2 = PyUnicode_DATA(str2);
11350
11351 cmp = memcmp(data1, data2, len * kind);
11352 return (cmp == 0);
11353}
11354
11355
Alexander Belopolsky40018472011-02-26 01:02:56 +000011356int
11357PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011359 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11360 if (PyUnicode_READY(left) == -1 ||
11361 PyUnicode_READY(right) == -1)
11362 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011363
11364 /* a string is equal to itself */
11365 if (left == right)
11366 return 0;
11367
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011368 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011370 PyErr_Format(PyExc_TypeError,
11371 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011372 Py_TYPE(left)->tp_name,
11373 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374 return -1;
11375}
11376
Martin v. Löwis5b222132007-06-10 09:51:05 +000011377int
11378PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11379{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011380 Py_ssize_t i;
11381 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011382 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011383 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011384
Victor Stinner910337b2011-10-03 03:20:16 +020011385 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011386 if (!PyUnicode_IS_READY(uni)) {
11387 const wchar_t *ws = _PyUnicode_WSTR(uni);
11388 /* Compare Unicode string and source character set string */
11389 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11390 if (chr != ustr[i])
11391 return (chr < ustr[i]) ? -1 : 1;
11392 }
11393 /* This check keeps Python strings that end in '\0' from comparing equal
11394 to C strings identical up to that point. */
11395 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11396 return 1; /* uni is longer */
11397 if (ustr[i])
11398 return -1; /* str is longer */
11399 return 0;
11400 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011401 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011402 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011403 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011404 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011405 size_t len, len2 = strlen(str);
11406 int cmp;
11407
11408 len = Py_MIN(len1, len2);
11409 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011410 if (cmp != 0) {
11411 if (cmp < 0)
11412 return -1;
11413 else
11414 return 1;
11415 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011416 if (len1 > len2)
11417 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011418 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011419 return -1; /* str is longer */
11420 return 0;
11421 }
11422 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011423 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011424 /* Compare Unicode string and source character set string */
11425 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011426 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011427 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11428 /* This check keeps Python strings that end in '\0' from comparing equal
11429 to C strings identical up to that point. */
11430 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11431 return 1; /* uni is longer */
11432 if (str[i])
11433 return -1; /* str is longer */
11434 return 0;
11435 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011436}
11437
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011438static int
11439non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11440{
11441 size_t i, len;
11442 const wchar_t *p;
11443 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11444 if (strlen(str) != len)
11445 return 0;
11446 p = _PyUnicode_WSTR(unicode);
11447 assert(p);
11448 for (i = 0; i < len; i++) {
11449 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011450 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011451 return 0;
11452 }
11453 return 1;
11454}
11455
11456int
11457_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11458{
11459 size_t len;
11460 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011461 assert(str);
11462#ifndef NDEBUG
11463 for (const char *p = str; *p; p++) {
11464 assert((unsigned char)*p < 128);
11465 }
11466#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011467 if (PyUnicode_READY(unicode) == -1) {
11468 /* Memory error or bad data */
11469 PyErr_Clear();
11470 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11471 }
11472 if (!PyUnicode_IS_ASCII(unicode))
11473 return 0;
11474 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11475 return strlen(str) == len &&
11476 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11477}
11478
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011479int
11480_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11481{
11482 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011483
11484 assert(_PyUnicode_CHECK(left));
11485 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011486#ifndef NDEBUG
11487 for (const char *p = right->string; *p; p++) {
11488 assert((unsigned char)*p < 128);
11489 }
11490#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011491
11492 if (PyUnicode_READY(left) == -1) {
11493 /* memory error or bad data */
11494 PyErr_Clear();
11495 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11496 }
11497
11498 if (!PyUnicode_IS_ASCII(left))
11499 return 0;
11500
11501 right_uni = _PyUnicode_FromId(right); /* borrowed */
11502 if (right_uni == NULL) {
11503 /* memory error or bad data */
11504 PyErr_Clear();
11505 return _PyUnicode_EqualToASCIIString(left, right->string);
11506 }
11507
11508 if (left == right_uni)
11509 return 1;
11510
11511 if (PyUnicode_CHECK_INTERNED(left))
11512 return 0;
11513
INADA Naoki7cc95f52018-01-28 02:07:09 +090011514 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011515 Py_hash_t hash = _PyUnicode_HASH(left);
Victor Stinnerea251802020-12-26 02:58:33 +010011516 if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011517 return 0;
Victor Stinnerea251802020-12-26 02:58:33 +010011518 }
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011519
11520 return unicode_compare_eq(left, right_uni);
11521}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011522
Alexander Belopolsky40018472011-02-26 01:02:56 +000011523PyObject *
11524PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011525{
11526 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011527
Victor Stinnere5567ad2012-10-23 02:48:49 +020011528 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11529 Py_RETURN_NOTIMPLEMENTED;
11530
11531 if (PyUnicode_READY(left) == -1 ||
11532 PyUnicode_READY(right) == -1)
11533 return NULL;
11534
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011535 if (left == right) {
11536 switch (op) {
11537 case Py_EQ:
11538 case Py_LE:
11539 case Py_GE:
11540 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011541 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011542 case Py_NE:
11543 case Py_LT:
11544 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011545 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011546 default:
11547 PyErr_BadArgument();
11548 return NULL;
11549 }
11550 }
11551 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011552 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011553 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011554 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011555 }
11556 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011557 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011558 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011559 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011560}
11561
Alexander Belopolsky40018472011-02-26 01:02:56 +000011562int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011563_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11564{
11565 return unicode_eq(aa, bb);
11566}
11567
11568int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011569PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011570{
Victor Stinner77282cb2013-04-14 19:22:47 +020011571 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011572 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011573 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011574 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011575
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011576 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011577 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011578 "'in <string>' requires string as left operand, not %.100s",
11579 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011580 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011581 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011582 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011583 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011584 if (ensure_unicode(str) < 0)
11585 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011586
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011587 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011588 kind2 = PyUnicode_KIND(substr);
11589 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011590 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011591 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011592 len2 = PyUnicode_GET_LENGTH(substr);
11593 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011594 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011595 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011596 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011597 if (len2 == 1) {
11598 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11599 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011600 return result;
11601 }
11602 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011603 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011604 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011605 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011606 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011607
Victor Stinner77282cb2013-04-14 19:22:47 +020011608 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011609 case PyUnicode_1BYTE_KIND:
11610 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11611 break;
11612 case PyUnicode_2BYTE_KIND:
11613 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11614 break;
11615 case PyUnicode_4BYTE_KIND:
11616 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11617 break;
11618 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011619 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011621
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011622 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011623 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011624 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625
Guido van Rossum403d68b2000-03-13 15:55:09 +000011626 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011627}
11628
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629/* Concat to string or Unicode object giving a new Unicode object. */
11630
Alexander Belopolsky40018472011-02-26 01:02:56 +000011631PyObject *
11632PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011633{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011634 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011635 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011636 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011637
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011638 if (ensure_unicode(left) < 0)
11639 return NULL;
11640
11641 if (!PyUnicode_Check(right)) {
11642 PyErr_Format(PyExc_TypeError,
11643 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011644 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011645 return NULL;
11646 }
11647 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011648 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649
11650 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011651 PyObject *empty = unicode_get_empty(); // Borrowed reference
11652 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011653 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011654 }
11655 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011656 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011657 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011659 left_len = PyUnicode_GET_LENGTH(left);
11660 right_len = PyUnicode_GET_LENGTH(right);
11661 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011662 PyErr_SetString(PyExc_OverflowError,
11663 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011664 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011665 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011666 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011667
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011668 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11669 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011670 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011673 result = PyUnicode_New(new_len, maxchar);
11674 if (result == NULL)
11675 return NULL;
11676 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11677 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11678 assert(_PyUnicode_CheckConsistency(result, 1));
11679 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011680}
11681
Walter Dörwald1ab83302007-05-18 17:15:44 +000011682void
Victor Stinner23e56682011-10-03 03:54:37 +020011683PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011684{
Victor Stinner23e56682011-10-03 03:54:37 +020011685 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011686 Py_UCS4 maxchar, maxchar2;
11687 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011688
11689 if (p_left == NULL) {
11690 if (!PyErr_Occurred())
11691 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011692 return;
11693 }
Victor Stinner23e56682011-10-03 03:54:37 +020011694 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011695 if (right == NULL || left == NULL
11696 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011697 if (!PyErr_Occurred())
11698 PyErr_BadInternalCall();
11699 goto error;
11700 }
11701
Benjamin Petersonbac79492012-01-14 13:34:47 -050011702 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011703 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011704 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011705 goto error;
11706
Victor Stinner488fa492011-12-12 00:01:39 +010011707 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011708 PyObject *empty = unicode_get_empty(); // Borrowed reference
11709 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011710 Py_DECREF(left);
11711 Py_INCREF(right);
11712 *p_left = right;
11713 return;
11714 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011715 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011716 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011717 }
Victor Stinner488fa492011-12-12 00:01:39 +010011718
11719 left_len = PyUnicode_GET_LENGTH(left);
11720 right_len = PyUnicode_GET_LENGTH(right);
11721 if (left_len > PY_SSIZE_T_MAX - right_len) {
11722 PyErr_SetString(PyExc_OverflowError,
11723 "strings are too large to concat");
11724 goto error;
11725 }
11726 new_len = left_len + right_len;
11727
11728 if (unicode_modifiable(left)
11729 && PyUnicode_CheckExact(right)
11730 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011731 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11732 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011733 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011734 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011735 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11736 {
11737 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011738 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011739 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011740
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011741 /* copy 'right' into the newly allocated area of 'left' */
11742 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011743 }
Victor Stinner488fa492011-12-12 00:01:39 +010011744 else {
11745 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11746 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011747 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011748
Victor Stinner488fa492011-12-12 00:01:39 +010011749 /* Concat the two Unicode strings */
11750 res = PyUnicode_New(new_len, maxchar);
11751 if (res == NULL)
11752 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011753 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11754 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011755 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011756 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011757 }
11758 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011759 return;
11760
11761error:
Victor Stinner488fa492011-12-12 00:01:39 +010011762 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011763}
11764
11765void
11766PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11767{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011768 PyUnicode_Append(pleft, right);
11769 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011770}
11771
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011772/*
11773Wraps stringlib_parse_args_finds() and additionally ensures that the
11774first argument is a unicode object.
11775*/
11776
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011777static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011778parse_args_finds_unicode(const char * function_name, PyObject *args,
11779 PyObject **substring,
11780 Py_ssize_t *start, Py_ssize_t *end)
11781{
11782 if(stringlib_parse_args_finds(function_name, args, substring,
11783 start, end)) {
11784 if (ensure_unicode(*substring) < 0)
11785 return 0;
11786 return 1;
11787 }
11788 return 0;
11789}
11790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011791PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011792 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011794Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011795string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011796interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797
11798static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011799unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011801 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011802 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011803 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011805 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011806 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011809 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011810 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 kind1 = PyUnicode_KIND(self);
11813 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011814 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011815 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011816
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 len1 = PyUnicode_GET_LENGTH(self);
11818 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011819 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011820 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011821 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011822
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011823 buf1 = PyUnicode_DATA(self);
11824 buf2 = PyUnicode_DATA(substring);
11825 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011826 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011827 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011828 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011829 }
11830 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011831 case PyUnicode_1BYTE_KIND:
11832 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011833 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011834 buf2, len2, PY_SSIZE_T_MAX
11835 );
11836 break;
11837 case PyUnicode_2BYTE_KIND:
11838 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011839 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 buf2, len2, PY_SSIZE_T_MAX
11841 );
11842 break;
11843 case PyUnicode_4BYTE_KIND:
11844 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011845 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 buf2, len2, PY_SSIZE_T_MAX
11847 );
11848 break;
11849 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011850 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011851 }
11852
11853 result = PyLong_FromSsize_t(iresult);
11854
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011855 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011856 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011857 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011858
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859 return result;
11860}
11861
INADA Naoki3ae20562017-01-16 20:41:20 +090011862/*[clinic input]
11863str.encode as unicode_encode
11864
11865 encoding: str(c_default="NULL") = 'utf-8'
11866 The encoding in which to encode the string.
11867 errors: str(c_default="NULL") = 'strict'
11868 The error handling scheme to use for encoding errors.
11869 The default is 'strict' meaning that encoding errors raise a
11870 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11871 'xmlcharrefreplace' as well as any other name registered with
11872 codecs.register_error that can handle UnicodeEncodeErrors.
11873
11874Encode the string using the codec registered for encoding.
11875[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011876
11877static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011878unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011879/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011880{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011881 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011882}
11883
INADA Naoki3ae20562017-01-16 20:41:20 +090011884/*[clinic input]
11885str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886
INADA Naoki3ae20562017-01-16 20:41:20 +090011887 tabsize: int = 8
11888
11889Return a copy where all tab characters are expanded using spaces.
11890
11891If tabsize is not given, a tab size of 8 characters is assumed.
11892[clinic start generated code]*/
11893
11894static PyObject *
11895unicode_expandtabs_impl(PyObject *self, int tabsize)
11896/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011898 Py_ssize_t i, j, line_pos, src_len, incr;
11899 Py_UCS4 ch;
11900 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011901 const void *src_data;
11902 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011903 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011904 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905
Antoine Pitrou22425222011-10-04 19:10:51 +020011906 if (PyUnicode_READY(self) == -1)
11907 return NULL;
11908
Thomas Wouters7e474022000-07-16 12:04:32 +000011909 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011910 src_len = PyUnicode_GET_LENGTH(self);
11911 i = j = line_pos = 0;
11912 kind = PyUnicode_KIND(self);
11913 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011914 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011915 for (; i < src_len; i++) {
11916 ch = PyUnicode_READ(kind, src_data, i);
11917 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011918 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011919 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011920 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011921 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011922 goto overflow;
11923 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011924 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011925 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011926 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011927 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011928 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011929 goto overflow;
11930 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011932 if (ch == '\n' || ch == '\r')
11933 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011935 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011936 if (!found)
11937 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011938
Guido van Rossumd57fd912000-03-10 22:53:23 +000011939 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011940 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941 if (!u)
11942 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011943 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944
Antoine Pitroue71d5742011-10-04 15:55:09 +020011945 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011946
Antoine Pitroue71d5742011-10-04 15:55:09 +020011947 for (; i < src_len; i++) {
11948 ch = PyUnicode_READ(kind, src_data, i);
11949 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011950 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011951 incr = tabsize - (line_pos % tabsize);
11952 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011953 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011954 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011955 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011956 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011957 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011958 line_pos++;
11959 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011960 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011961 if (ch == '\n' || ch == '\r')
11962 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011963 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011964 }
11965 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011966 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011967
Antoine Pitroue71d5742011-10-04 15:55:09 +020011968 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011969 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11970 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011971}
11972
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011973PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011974 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975\n\
11976Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011977such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011978arguments start and end are interpreted as in slice notation.\n\
11979\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011980Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981
11982static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011985 /* initialize variables to prevent gcc warning */
11986 PyObject *substring = NULL;
11987 Py_ssize_t start = 0;
11988 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011989 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011991 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011992 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011994 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011997 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011998
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011999 if (result == -2)
12000 return NULL;
12001
Christian Heimes217cfd12007-12-02 14:31:20 +000012002 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003}
12004
12005static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012006unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012008 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012009 enum PyUnicode_Kind kind;
12010 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012011
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012012 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012013 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012014 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012015 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012016 if (PyUnicode_READY(self) == -1) {
12017 return NULL;
12018 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012019 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
12020 PyErr_SetString(PyExc_IndexError, "string index out of range");
12021 return NULL;
12022 }
12023 kind = PyUnicode_KIND(self);
12024 data = PyUnicode_DATA(self);
12025 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010012026 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027}
12028
Guido van Rossumc2504932007-09-18 19:42:40 +000012029/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010012030 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000012031static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012032unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080012034 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000012035
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012036#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050012037 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012038#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012039 if (_PyUnicode_HASH(self) != -1)
12040 return _PyUnicode_HASH(self);
12041 if (PyUnicode_READY(self) == -1)
12042 return -1;
animalizea1d14252019-01-02 20:16:06 +080012043
Christian Heimes985ecdc2013-11-20 11:46:18 +010012044 x = _Py_HashBytes(PyUnicode_DATA(self),
12045 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000012047 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012048}
12049
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012050PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012051 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052\n\
oldkaa0735f2018-02-02 16:52:55 +080012053Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012054such that sub is contained within S[start:end]. Optional\n\
12055arguments start and end are interpreted as in slice notation.\n\
12056\n\
12057Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058
12059static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012062 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000012063 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012064 PyObject *substring = NULL;
12065 Py_ssize_t start = 0;
12066 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012068 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012071 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012074 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 if (result == -2)
12077 return NULL;
12078
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079 if (result < 0) {
12080 PyErr_SetString(PyExc_ValueError, "substring not found");
12081 return NULL;
12082 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012083
Christian Heimes217cfd12007-12-02 14:31:20 +000012084 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085}
12086
INADA Naoki3ae20562017-01-16 20:41:20 +090012087/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090012088str.isascii as unicode_isascii
12089
12090Return True if all characters in the string are ASCII, False otherwise.
12091
12092ASCII characters have code points in the range U+0000-U+007F.
12093Empty string is ASCII too.
12094[clinic start generated code]*/
12095
12096static PyObject *
12097unicode_isascii_impl(PyObject *self)
12098/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12099{
12100 if (PyUnicode_READY(self) == -1) {
12101 return NULL;
12102 }
12103 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12104}
12105
12106/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090012107str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108
INADA Naoki3ae20562017-01-16 20:41:20 +090012109Return True if the string is a lowercase string, False otherwise.
12110
12111A string is lowercase if all cased characters in the string are lowercase and
12112there is at least one cased character in the string.
12113[clinic start generated code]*/
12114
12115static PyObject *
12116unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012117/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 Py_ssize_t i, length;
12120 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012121 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122 int cased;
12123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 if (PyUnicode_READY(self) == -1)
12125 return NULL;
12126 length = PyUnicode_GET_LENGTH(self);
12127 kind = PyUnicode_KIND(self);
12128 data = PyUnicode_DATA(self);
12129
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012131 if (length == 1)
12132 return PyBool_FromLong(
12133 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012135 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012136 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012137 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012138
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 for (i = 0; i < length; i++) {
12141 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012142
Benjamin Peterson29060642009-01-31 22:14:21 +000012143 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012144 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012145 else if (!cased && Py_UNICODE_ISLOWER(ch))
12146 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012148 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149}
12150
INADA Naoki3ae20562017-01-16 20:41:20 +090012151/*[clinic input]
12152str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012153
INADA Naoki3ae20562017-01-16 20:41:20 +090012154Return True if the string is an uppercase string, False otherwise.
12155
12156A string is uppercase if all cased characters in the string are uppercase and
12157there is at least one cased character in the string.
12158[clinic start generated code]*/
12159
12160static PyObject *
12161unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012162/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 Py_ssize_t i, length;
12165 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012166 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167 int cased;
12168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012169 if (PyUnicode_READY(self) == -1)
12170 return NULL;
12171 length = PyUnicode_GET_LENGTH(self);
12172 kind = PyUnicode_KIND(self);
12173 data = PyUnicode_DATA(self);
12174
Guido van Rossumd57fd912000-03-10 22:53:23 +000012175 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012176 if (length == 1)
12177 return PyBool_FromLong(
12178 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012179
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012180 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012182 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012183
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 for (i = 0; i < length; i++) {
12186 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012187
Benjamin Peterson29060642009-01-31 22:14:21 +000012188 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012189 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012190 else if (!cased && Py_UNICODE_ISUPPER(ch))
12191 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012192 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012193 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194}
12195
INADA Naoki3ae20562017-01-16 20:41:20 +090012196/*[clinic input]
12197str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012198
INADA Naoki3ae20562017-01-16 20:41:20 +090012199Return True if the string is a title-cased string, False otherwise.
12200
12201In a title-cased string, upper- and title-case characters may only
12202follow uncased characters and lowercase characters only cased ones.
12203[clinic start generated code]*/
12204
12205static PyObject *
12206unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012207/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209 Py_ssize_t i, length;
12210 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012211 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012212 int cased, previous_is_cased;
12213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 if (PyUnicode_READY(self) == -1)
12215 return NULL;
12216 length = PyUnicode_GET_LENGTH(self);
12217 kind = PyUnicode_KIND(self);
12218 data = PyUnicode_DATA(self);
12219
Guido van Rossumd57fd912000-03-10 22:53:23 +000012220 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012221 if (length == 1) {
12222 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12223 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12224 (Py_UNICODE_ISUPPER(ch) != 0));
12225 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012227 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012228 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012229 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012230
Guido van Rossumd57fd912000-03-10 22:53:23 +000012231 cased = 0;
12232 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012233 for (i = 0; i < length; i++) {
12234 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012235
Benjamin Peterson29060642009-01-31 22:14:21 +000012236 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12237 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012238 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012239 previous_is_cased = 1;
12240 cased = 1;
12241 }
12242 else if (Py_UNICODE_ISLOWER(ch)) {
12243 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012244 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012245 previous_is_cased = 1;
12246 cased = 1;
12247 }
12248 else
12249 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012250 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012251 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252}
12253
INADA Naoki3ae20562017-01-16 20:41:20 +090012254/*[clinic input]
12255str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256
INADA Naoki3ae20562017-01-16 20:41:20 +090012257Return True if the string is a whitespace string, False otherwise.
12258
12259A string is whitespace if all characters in the string are whitespace and there
12260is at least one character in the string.
12261[clinic start generated code]*/
12262
12263static PyObject *
12264unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012265/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012267 Py_ssize_t i, length;
12268 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012269 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270
12271 if (PyUnicode_READY(self) == -1)
12272 return NULL;
12273 length = PyUnicode_GET_LENGTH(self);
12274 kind = PyUnicode_KIND(self);
12275 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012278 if (length == 1)
12279 return PyBool_FromLong(
12280 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012282 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012283 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012284 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012286 for (i = 0; i < length; i++) {
12287 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012288 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012289 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012291 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012292}
12293
INADA Naoki3ae20562017-01-16 20:41:20 +090012294/*[clinic input]
12295str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012296
INADA Naoki3ae20562017-01-16 20:41:20 +090012297Return True if the string is an alphabetic string, False otherwise.
12298
12299A string is alphabetic if all characters in the string are alphabetic and there
12300is at least one character in the string.
12301[clinic start generated code]*/
12302
12303static PyObject *
12304unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012305/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012306{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012307 Py_ssize_t i, length;
12308 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012309 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310
12311 if (PyUnicode_READY(self) == -1)
12312 return NULL;
12313 length = PyUnicode_GET_LENGTH(self);
12314 kind = PyUnicode_KIND(self);
12315 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012316
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012317 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 if (length == 1)
12319 return PyBool_FromLong(
12320 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012321
12322 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012323 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012324 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012325
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012326 for (i = 0; i < length; i++) {
12327 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012328 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012329 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012330 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012331}
12332
INADA Naoki3ae20562017-01-16 20:41:20 +090012333/*[clinic input]
12334str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012335
INADA Naoki3ae20562017-01-16 20:41:20 +090012336Return True if the string is an alpha-numeric string, False otherwise.
12337
12338A string is alpha-numeric if all characters in the string are alpha-numeric and
12339there is at least one character in the string.
12340[clinic start generated code]*/
12341
12342static PyObject *
12343unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012344/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012345{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012346 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012347 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348 Py_ssize_t len, i;
12349
12350 if (PyUnicode_READY(self) == -1)
12351 return NULL;
12352
12353 kind = PyUnicode_KIND(self);
12354 data = PyUnicode_DATA(self);
12355 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012356
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012357 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 if (len == 1) {
12359 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12360 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12361 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012362
12363 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012364 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012365 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012366
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 for (i = 0; i < len; i++) {
12368 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012369 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012370 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012371 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012372 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012373}
12374
INADA Naoki3ae20562017-01-16 20:41:20 +090012375/*[clinic input]
12376str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377
INADA Naoki3ae20562017-01-16 20:41:20 +090012378Return True if the string is a decimal string, False otherwise.
12379
12380A string is a decimal string if all characters in the string are decimal and
12381there is at least one character in the string.
12382[clinic start generated code]*/
12383
12384static PyObject *
12385unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012386/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012388 Py_ssize_t i, length;
12389 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012390 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391
12392 if (PyUnicode_READY(self) == -1)
12393 return NULL;
12394 length = PyUnicode_GET_LENGTH(self);
12395 kind = PyUnicode_KIND(self);
12396 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397
Guido van Rossumd57fd912000-03-10 22:53:23 +000012398 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399 if (length == 1)
12400 return PyBool_FromLong(
12401 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012402
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012403 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012404 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012405 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012406
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012407 for (i = 0; i < length; i++) {
12408 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012409 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012410 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012411 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012412}
12413
INADA Naoki3ae20562017-01-16 20:41:20 +090012414/*[clinic input]
12415str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416
INADA Naoki3ae20562017-01-16 20:41:20 +090012417Return True if the string is a digit string, False otherwise.
12418
12419A string is a digit string if all characters in the string are digits and there
12420is at least one character in the string.
12421[clinic start generated code]*/
12422
12423static PyObject *
12424unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012425/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012426{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012427 Py_ssize_t i, length;
12428 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012429 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012430
12431 if (PyUnicode_READY(self) == -1)
12432 return NULL;
12433 length = PyUnicode_GET_LENGTH(self);
12434 kind = PyUnicode_KIND(self);
12435 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436
Guido van Rossumd57fd912000-03-10 22:53:23 +000012437 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012438 if (length == 1) {
12439 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12440 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12441 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012443 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012444 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012445 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 for (i = 0; i < length; i++) {
12448 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012449 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012450 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012451 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452}
12453
INADA Naoki3ae20562017-01-16 20:41:20 +090012454/*[clinic input]
12455str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012456
INADA Naoki3ae20562017-01-16 20:41:20 +090012457Return True if the string is a numeric string, False otherwise.
12458
12459A string is numeric if all characters in the string are numeric and there is at
12460least one character in the string.
12461[clinic start generated code]*/
12462
12463static PyObject *
12464unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012465/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 Py_ssize_t i, length;
12468 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012469 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012470
12471 if (PyUnicode_READY(self) == -1)
12472 return NULL;
12473 length = PyUnicode_GET_LENGTH(self);
12474 kind = PyUnicode_KIND(self);
12475 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476
Guido van Rossumd57fd912000-03-10 22:53:23 +000012477 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012478 if (length == 1)
12479 return PyBool_FromLong(
12480 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012481
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012482 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012483 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012484 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012485
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012486 for (i = 0; i < length; i++) {
12487 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012488 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012489 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012490 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012491}
12492
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012493Py_ssize_t
12494_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012495{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012497 if (PyUnicode_READY(self) == -1)
12498 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012499
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012500 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012501 if (len == 0) {
12502 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012503 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012504 }
12505
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012506 int kind = PyUnicode_KIND(self);
12507 const void *data = PyUnicode_DATA(self);
12508 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012509 /* PEP 3131 says that the first character must be in
12510 XID_Start and subsequent characters in XID_Continue,
12511 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012512 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012513 letters, digits, underscore). However, given the current
12514 definition of XID_Start and XID_Continue, it is sufficient
12515 to check just for these, except that _ must be allowed
12516 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012517 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012518 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012519 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012520
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012521 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012522 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012523 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012524 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012525 }
12526 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012527 return i;
12528}
12529
12530int
12531PyUnicode_IsIdentifier(PyObject *self)
12532{
12533 if (PyUnicode_IS_READY(self)) {
12534 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12535 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12536 /* an empty string is not a valid identifier */
12537 return len && i == len;
12538 }
12539 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012540_Py_COMP_DIAG_PUSH
12541_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012542 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012543 if (len == 0) {
12544 /* an empty string is not a valid identifier */
12545 return 0;
12546 }
12547
12548 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012549 Py_UCS4 ch = wstr[i++];
12550#if SIZEOF_WCHAR_T == 2
12551 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12552 && i < len
12553 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12554 {
12555 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12556 i++;
12557 }
12558#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012559 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12560 return 0;
12561 }
12562
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012563 while (i < len) {
12564 ch = wstr[i++];
12565#if SIZEOF_WCHAR_T == 2
12566 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12567 && i < len
12568 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12569 {
12570 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12571 i++;
12572 }
12573#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012574 if (!_PyUnicode_IsXidContinue(ch)) {
12575 return 0;
12576 }
12577 }
12578 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012579_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012580 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012581}
12582
INADA Naoki3ae20562017-01-16 20:41:20 +090012583/*[clinic input]
12584str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012585
INADA Naoki3ae20562017-01-16 20:41:20 +090012586Return True if the string is a valid Python identifier, False otherwise.
12587
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012588Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012589such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012590[clinic start generated code]*/
12591
12592static PyObject *
12593unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012594/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012595{
12596 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12597}
12598
INADA Naoki3ae20562017-01-16 20:41:20 +090012599/*[clinic input]
12600str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012601
INADA Naoki3ae20562017-01-16 20:41:20 +090012602Return True if the string is printable, False otherwise.
12603
12604A string is printable if all of its characters are considered printable in
12605repr() or if it is empty.
12606[clinic start generated code]*/
12607
12608static PyObject *
12609unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012610/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012611{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012612 Py_ssize_t i, length;
12613 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012614 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012615
12616 if (PyUnicode_READY(self) == -1)
12617 return NULL;
12618 length = PyUnicode_GET_LENGTH(self);
12619 kind = PyUnicode_KIND(self);
12620 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012621
12622 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 if (length == 1)
12624 return PyBool_FromLong(
12625 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012626
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012627 for (i = 0; i < length; i++) {
12628 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012629 Py_RETURN_FALSE;
12630 }
12631 }
12632 Py_RETURN_TRUE;
12633}
12634
INADA Naoki3ae20562017-01-16 20:41:20 +090012635/*[clinic input]
12636str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012637
INADA Naoki3ae20562017-01-16 20:41:20 +090012638 iterable: object
12639 /
12640
12641Concatenate any number of strings.
12642
Martin Panter91a88662017-01-24 00:30:06 +000012643The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012644The result is returned as a new string.
12645
12646Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12647[clinic start generated code]*/
12648
12649static PyObject *
12650unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012651/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012652{
INADA Naoki3ae20562017-01-16 20:41:20 +090012653 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012654}
12655
Martin v. Löwis18e16552006-02-15 17:27:45 +000012656static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012657unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012658{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 if (PyUnicode_READY(self) == -1)
12660 return -1;
12661 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662}
12663
INADA Naoki3ae20562017-01-16 20:41:20 +090012664/*[clinic input]
12665str.ljust as unicode_ljust
12666
12667 width: Py_ssize_t
12668 fillchar: Py_UCS4 = ' '
12669 /
12670
12671Return a left-justified string of length width.
12672
12673Padding is done using the specified fill character (default is a space).
12674[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012675
12676static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012677unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12678/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012680 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012682
Victor Stinnerc4b49542011-12-11 22:44:26 +010012683 if (PyUnicode_GET_LENGTH(self) >= width)
12684 return unicode_result_unchanged(self);
12685
12686 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687}
12688
INADA Naoki3ae20562017-01-16 20:41:20 +090012689/*[clinic input]
12690str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691
INADA Naoki3ae20562017-01-16 20:41:20 +090012692Return a copy of the string converted to lowercase.
12693[clinic start generated code]*/
12694
12695static PyObject *
12696unicode_lower_impl(PyObject *self)
12697/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012698{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012699 if (PyUnicode_READY(self) == -1)
12700 return NULL;
12701 if (PyUnicode_IS_ASCII(self))
12702 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012703 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012704}
12705
/* Strip-type selectors; their values double as indexes into stripfuncnames. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the selectors above */
static const char *stripfuncnames[] = {
    "lstrip",
    "rstrip",
    "strip",
};

/* Look up the method name belonging to strip selector i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012714
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012715/* externally visible for str.strip(unicode) */
12716PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012717_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012718{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012719 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720 int kind;
12721 Py_ssize_t i, j, len;
12722 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012723 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012724
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012725 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12726 return NULL;
12727
12728 kind = PyUnicode_KIND(self);
12729 data = PyUnicode_DATA(self);
12730 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012731 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12733 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012734 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012735
Benjamin Peterson14339b62009-01-31 16:36:08 +000012736 i = 0;
12737 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012738 while (i < len) {
12739 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12740 if (!BLOOM(sepmask, ch))
12741 break;
12742 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12743 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012744 i++;
12745 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012746 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012747
Benjamin Peterson14339b62009-01-31 16:36:08 +000012748 j = len;
12749 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012750 j--;
12751 while (j >= i) {
12752 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12753 if (!BLOOM(sepmask, ch))
12754 break;
12755 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12756 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012757 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012758 }
12759
Benjamin Peterson29060642009-01-31 22:14:21 +000012760 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012761 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012762
Victor Stinner7931d9a2011-11-04 00:22:48 +010012763 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012764}
12765
12766PyObject*
12767PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12768{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012769 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012770 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012771 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772
Victor Stinnerde636f32011-10-01 03:55:54 +020012773 if (PyUnicode_READY(self) == -1)
12774 return NULL;
12775
Victor Stinner684d5fd2012-05-03 02:32:34 +020012776 length = PyUnicode_GET_LENGTH(self);
12777 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012778
Victor Stinner684d5fd2012-05-03 02:32:34 +020012779 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012780 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012781
Victor Stinnerde636f32011-10-01 03:55:54 +020012782 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012783 PyErr_SetString(PyExc_IndexError, "string index out of range");
12784 return NULL;
12785 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012786 if (start >= length || end < start)
12787 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012788
Victor Stinner684d5fd2012-05-03 02:32:34 +020012789 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012790 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012791 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012792 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012793 }
12794 else {
12795 kind = PyUnicode_KIND(self);
12796 data = PyUnicode_1BYTE_DATA(self);
12797 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012798 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012799 length);
12800 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012801}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012802
12803static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012804do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012806 Py_ssize_t len, i, j;
12807
12808 if (PyUnicode_READY(self) == -1)
12809 return NULL;
12810
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012812
Victor Stinnercc7af722013-04-09 22:39:24 +020012813 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012814 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012815
12816 i = 0;
12817 if (striptype != RIGHTSTRIP) {
12818 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012819 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012820 if (!_Py_ascii_whitespace[ch])
12821 break;
12822 i++;
12823 }
12824 }
12825
12826 j = len;
12827 if (striptype != LEFTSTRIP) {
12828 j--;
12829 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012830 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012831 if (!_Py_ascii_whitespace[ch])
12832 break;
12833 j--;
12834 }
12835 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012836 }
12837 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012838 else {
12839 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012840 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012841
Victor Stinnercc7af722013-04-09 22:39:24 +020012842 i = 0;
12843 if (striptype != RIGHTSTRIP) {
12844 while (i < len) {
12845 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12846 if (!Py_UNICODE_ISSPACE(ch))
12847 break;
12848 i++;
12849 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012850 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012851
12852 j = len;
12853 if (striptype != LEFTSTRIP) {
12854 j--;
12855 while (j >= i) {
12856 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12857 if (!Py_UNICODE_ISSPACE(ch))
12858 break;
12859 j--;
12860 }
12861 j++;
12862 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012863 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012864
Victor Stinner7931d9a2011-11-04 00:22:48 +010012865 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866}
12867
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012868
12869static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012870do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012871{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012872 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012873 if (PyUnicode_Check(sep))
12874 return _PyUnicode_XStrip(self, striptype, sep);
12875 else {
12876 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012877 "%s arg must be None or str",
12878 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012879 return NULL;
12880 }
12881 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012882
Benjamin Peterson14339b62009-01-31 16:36:08 +000012883 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012884}
12885
12886
INADA Naoki3ae20562017-01-16 20:41:20 +090012887/*[clinic input]
12888str.strip as unicode_strip
12889
12890 chars: object = None
12891 /
12892
Zachary Ware09895c22019-10-09 16:09:00 -050012893Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012894
12895If chars is given and not None, remove characters in chars instead.
12896[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012897
12898static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012899unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012900/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012901{
INADA Naoki3ae20562017-01-16 20:41:20 +090012902 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012903}
12904
12905
INADA Naoki3ae20562017-01-16 20:41:20 +090012906/*[clinic input]
12907str.lstrip as unicode_lstrip
12908
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012909 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012910 /
12911
12912Return a copy of the string with leading whitespace removed.
12913
12914If chars is given and not None, remove characters in chars instead.
12915[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012916
12917static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012918unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012919/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012920{
INADA Naoki3ae20562017-01-16 20:41:20 +090012921 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012922}
12923
12924
INADA Naoki3ae20562017-01-16 20:41:20 +090012925/*[clinic input]
12926str.rstrip as unicode_rstrip
12927
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012928 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012929 /
12930
12931Return a copy of the string with trailing whitespace removed.
12932
12933If chars is given and not None, remove characters in chars instead.
12934[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012935
12936static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012937unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012938/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012939{
INADA Naoki3ae20562017-01-16 20:41:20 +090012940 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012941}
12942
12943
Guido van Rossumd57fd912000-03-10 22:53:23 +000012944static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012945unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012946{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012947 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012948 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012949
Serhiy Storchaka05997252013-01-26 12:14:02 +020012950 if (len < 1)
12951 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012952
Victor Stinnerc4b49542011-12-11 22:44:26 +010012953 /* no repeat, return original string */
12954 if (len == 1)
12955 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012956
Benjamin Petersonbac79492012-01-14 13:34:47 -050012957 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012958 return NULL;
12959
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012960 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012961 PyErr_SetString(PyExc_OverflowError,
12962 "repeated string is too long");
12963 return NULL;
12964 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012966
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012967 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012968 if (!u)
12969 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012970 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012971
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012972 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012973 int kind = PyUnicode_KIND(str);
12974 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012975 if (kind == PyUnicode_1BYTE_KIND) {
12976 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012977 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012978 }
12979 else if (kind == PyUnicode_2BYTE_KIND) {
12980 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012981 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012982 ucs2[n] = fill_char;
12983 } else {
12984 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12985 assert(kind == PyUnicode_4BYTE_KIND);
12986 for (n = 0; n < len; ++n)
12987 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012988 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012989 }
12990 else {
12991 /* number of characters copied this far */
12992 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012993 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012994 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012995 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012996 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012997 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012999 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013000 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000013001 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000013002 }
13003
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013004 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013005 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013006}
13007
Alexander Belopolsky40018472011-02-26 01:02:56 +000013008PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013009PyUnicode_Replace(PyObject *str,
13010 PyObject *substr,
13011 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000013012 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013013{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013014 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
13015 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013016 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013017 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013018}
13019
INADA Naoki3ae20562017-01-16 20:41:20 +090013020/*[clinic input]
13021str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000013022
INADA Naoki3ae20562017-01-16 20:41:20 +090013023 old: unicode
13024 new: unicode
13025 count: Py_ssize_t = -1
13026 Maximum number of occurrences to replace.
13027 -1 (the default value) means replace all occurrences.
13028 /
13029
13030Return a copy with all occurrences of substring old replaced by new.
13031
13032If the optional argument count is given, only the first count occurrences are
13033replaced.
13034[clinic start generated code]*/
13035
13036static PyObject *
13037unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
13038 Py_ssize_t count)
13039/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013040{
Benjamin Peterson22a29702012-01-02 09:00:30 -060013041 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013042 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090013043 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013044}
13045
sweeneydea81849b2020-04-22 17:05:48 -040013046/*[clinic input]
13047str.removeprefix as unicode_removeprefix
13048
13049 prefix: unicode
13050 /
13051
13052Return a str with the given prefix string removed if present.
13053
13054If the string starts with the prefix string, return string[len(prefix):].
13055Otherwise, return a copy of the original string.
13056[clinic start generated code]*/
13057
13058static PyObject *
13059unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13060/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13061{
13062 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13063 if (match == -1) {
13064 return NULL;
13065 }
13066 if (match) {
13067 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13068 PyUnicode_GET_LENGTH(self));
13069 }
13070 return unicode_result_unchanged(self);
13071}
13072
13073/*[clinic input]
13074str.removesuffix as unicode_removesuffix
13075
13076 suffix: unicode
13077 /
13078
13079Return a str with the given suffix string removed if present.
13080
13081If the string ends with the suffix string and that suffix is not empty,
13082return string[:-len(suffix)]. Otherwise, return a copy of the original
13083string.
13084[clinic start generated code]*/
13085
13086static PyObject *
13087unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13088/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13089{
13090 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13091 if (match == -1) {
13092 return NULL;
13093 }
13094 if (match) {
13095 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13096 - PyUnicode_GET_LENGTH(suffix));
13097 }
13098 return unicode_result_unchanged(self);
13099}
13100
Alexander Belopolsky40018472011-02-26 01:02:56 +000013101static PyObject *
13102unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013103{
Walter Dörwald79e913e2007-05-12 11:08:06 +000013104 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013105 Py_ssize_t isize;
13106 Py_ssize_t osize, squote, dquote, i, o;
13107 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020013108 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013109 const void *idata;
13110 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000013111
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013112 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000013113 return NULL;
13114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013115 isize = PyUnicode_GET_LENGTH(unicode);
13116 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000013117
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013118 /* Compute length of output, quote characters, and
13119 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020013120 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013121 max = 127;
13122 squote = dquote = 0;
13123 ikind = PyUnicode_KIND(unicode);
13124 for (i = 0; i < isize; i++) {
13125 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040013126 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013127 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040013128 case '\'': squote++; break;
13129 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040013131 incr = 2;
13132 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013133 default:
13134 /* Fast-path ASCII */
13135 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013136 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013137 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013138 ;
13139 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013141 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013142 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013144 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013145 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013146 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013147 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013148 if (osize > PY_SSIZE_T_MAX - incr) {
13149 PyErr_SetString(PyExc_OverflowError,
13150 "string is too long to generate repr");
13151 return NULL;
13152 }
13153 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013154 }
13155
13156 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013157 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013158 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013159 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013160 if (dquote)
13161 /* Both squote and dquote present. Use squote,
13162 and escape them */
13163 osize += squote;
13164 else
13165 quote = '"';
13166 }
Victor Stinner55c08782013-04-14 18:45:39 +020013167 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013168
13169 repr = PyUnicode_New(osize, max);
13170 if (repr == NULL)
13171 return NULL;
13172 okind = PyUnicode_KIND(repr);
13173 odata = PyUnicode_DATA(repr);
13174
13175 PyUnicode_WRITE(okind, odata, 0, quote);
13176 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013177 if (unchanged) {
13178 _PyUnicode_FastCopyCharacters(repr, 1,
13179 unicode, 0,
13180 isize);
13181 }
13182 else {
13183 for (i = 0, o = 1; i < isize; i++) {
13184 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013185
Victor Stinner55c08782013-04-14 18:45:39 +020013186 /* Escape quotes and backslashes */
13187 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013188 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013189 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013190 continue;
13191 }
13192
13193 /* Map special whitespace to '\t', \n', '\r' */
13194 if (ch == '\t') {
13195 PyUnicode_WRITE(okind, odata, o++, '\\');
13196 PyUnicode_WRITE(okind, odata, o++, 't');
13197 }
13198 else if (ch == '\n') {
13199 PyUnicode_WRITE(okind, odata, o++, '\\');
13200 PyUnicode_WRITE(okind, odata, o++, 'n');
13201 }
13202 else if (ch == '\r') {
13203 PyUnicode_WRITE(okind, odata, o++, '\\');
13204 PyUnicode_WRITE(okind, odata, o++, 'r');
13205 }
13206
13207 /* Map non-printable US ASCII to '\xhh' */
13208 else if (ch < ' ' || ch == 0x7F) {
13209 PyUnicode_WRITE(okind, odata, o++, '\\');
13210 PyUnicode_WRITE(okind, odata, o++, 'x');
13211 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13212 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13213 }
13214
13215 /* Copy ASCII characters as-is */
13216 else if (ch < 0x7F) {
13217 PyUnicode_WRITE(okind, odata, o++, ch);
13218 }
13219
13220 /* Non-ASCII characters */
13221 else {
13222 /* Map Unicode whitespace and control characters
13223 (categories Z* and C* except ASCII space)
13224 */
13225 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13226 PyUnicode_WRITE(okind, odata, o++, '\\');
13227 /* Map 8-bit characters to '\xhh' */
13228 if (ch <= 0xff) {
13229 PyUnicode_WRITE(okind, odata, o++, 'x');
13230 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13231 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13232 }
13233 /* Map 16-bit characters to '\uxxxx' */
13234 else if (ch <= 0xffff) {
13235 PyUnicode_WRITE(okind, odata, o++, 'u');
13236 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13237 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13238 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13239 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13240 }
13241 /* Map 21-bit characters to '\U00xxxxxx' */
13242 else {
13243 PyUnicode_WRITE(okind, odata, o++, 'U');
13244 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13245 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13246 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13247 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13248 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13249 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13250 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13251 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13252 }
13253 }
13254 /* Copy characters as-is */
13255 else {
13256 PyUnicode_WRITE(okind, odata, o++, ch);
13257 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013258 }
13259 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013260 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013261 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013262 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013263 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013264}
13265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013266PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013267 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013268\n\
13269Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013270such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013271arguments start and end are interpreted as in slice notation.\n\
13272\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013273Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274
13275static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013276unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013278 /* initialize variables to prevent gcc warning */
13279 PyObject *substring = NULL;
13280 Py_ssize_t start = 0;
13281 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013282 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013283
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013284 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013285 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013286
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013287 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013288 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013289
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013290 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013292 if (result == -2)
13293 return NULL;
13294
Christian Heimes217cfd12007-12-02 14:31:20 +000013295 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296}
13297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013298PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013299 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013300\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013301Return the highest index in S where substring sub is found,\n\
13302such that sub is contained within S[start:end]. Optional\n\
13303arguments start and end are interpreted as in slice notation.\n\
13304\n\
13305Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013306
13307static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013308unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013310 /* initialize variables to prevent gcc warning */
13311 PyObject *substring = NULL;
13312 Py_ssize_t start = 0;
13313 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013314 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013315
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013316 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013317 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013319 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013320 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013321
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013322 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013324 if (result == -2)
13325 return NULL;
13326
Guido van Rossumd57fd912000-03-10 22:53:23 +000013327 if (result < 0) {
13328 PyErr_SetString(PyExc_ValueError, "substring not found");
13329 return NULL;
13330 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331
Christian Heimes217cfd12007-12-02 14:31:20 +000013332 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013333}
13334
INADA Naoki3ae20562017-01-16 20:41:20 +090013335/*[clinic input]
13336str.rjust as unicode_rjust
13337
13338 width: Py_ssize_t
13339 fillchar: Py_UCS4 = ' '
13340 /
13341
13342Return a right-justified string of length width.
13343
13344Padding is done using the specified fill character (default is a space).
13345[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013346
13347static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013348unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13349/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013350{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013351 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013352 return NULL;
13353
Victor Stinnerc4b49542011-12-11 22:44:26 +010013354 if (PyUnicode_GET_LENGTH(self) >= width)
13355 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013356
Victor Stinnerc4b49542011-12-11 22:44:26 +010013357 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013358}
13359
Alexander Belopolsky40018472011-02-26 01:02:56 +000013360PyObject *
13361PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013363 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013364 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013365
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013366 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367}
13368
INADA Naoki3ae20562017-01-16 20:41:20 +090013369/*[clinic input]
13370str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371
INADA Naoki3ae20562017-01-16 20:41:20 +090013372 sep: object = None
13373 The delimiter according which to split the string.
13374 None (the default value) means split according to any whitespace,
13375 and discard empty strings from the result.
13376 maxsplit: Py_ssize_t = -1
13377 Maximum number of splits to do.
13378 -1 (the default value) means no limit.
13379
13380Return a list of the words in the string, using sep as the delimiter string.
13381[clinic start generated code]*/
13382
13383static PyObject *
13384unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13385/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013386{
INADA Naoki3ae20562017-01-16 20:41:20 +090013387 if (sep == Py_None)
13388 return split(self, NULL, maxsplit);
13389 if (PyUnicode_Check(sep))
13390 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013391
Victor Stinner998b8062018-09-12 00:23:25 +020013392 PyErr_Format(PyExc_TypeError,
13393 "must be str or None, not %.100s",
13394 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013396}
13397
Thomas Wouters477c8d52006-05-27 19:21:47 +000013398PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013399PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013400{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013401 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013402 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013403 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013404 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013405
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013406 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013407 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013408
Victor Stinner14f8f022011-10-05 20:58:25 +020013409 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013410 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013411 len1 = PyUnicode_GET_LENGTH(str_obj);
13412 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013413 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013414 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013415 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013416 }
13417 buf1 = PyUnicode_DATA(str_obj);
13418 buf2 = PyUnicode_DATA(sep_obj);
13419 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013420 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013421 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013422 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013423 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013424
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013425 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013426 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013427 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13428 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13429 else
13430 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013431 break;
13432 case PyUnicode_2BYTE_KIND:
13433 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13434 break;
13435 case PyUnicode_4BYTE_KIND:
13436 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13437 break;
13438 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013439 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013440 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013441
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013442 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013443 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013444 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013445
13446 return out;
13447}
13448
13449
13450PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013451PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013452{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013453 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013454 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013455 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013456 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013457
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013458 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013459 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013460
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013461 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013462 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013463 len1 = PyUnicode_GET_LENGTH(str_obj);
13464 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013465 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013466 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013467 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013468 }
13469 buf1 = PyUnicode_DATA(str_obj);
13470 buf2 = PyUnicode_DATA(sep_obj);
13471 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013472 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013473 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013474 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013476
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013477 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013478 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013479 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13480 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13481 else
13482 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013483 break;
13484 case PyUnicode_2BYTE_KIND:
13485 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13486 break;
13487 case PyUnicode_4BYTE_KIND:
13488 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13489 break;
13490 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013491 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013492 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013493
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013494 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013495 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013496 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013497
13498 return out;
13499}
13500
INADA Naoki3ae20562017-01-16 20:41:20 +090013501/*[clinic input]
13502str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013503
INADA Naoki3ae20562017-01-16 20:41:20 +090013504 sep: object
13505 /
13506
13507Partition the string into three parts using the given separator.
13508
13509This will search for the separator in the string. If the separator is found,
13510returns a 3-tuple containing the part before the separator, the separator
13511itself, and the part after it.
13512
13513If the separator is not found, returns a 3-tuple containing the original string
13514and two empty strings.
13515[clinic start generated code]*/
13516
13517static PyObject *
13518unicode_partition(PyObject *self, PyObject *sep)
13519/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013520{
INADA Naoki3ae20562017-01-16 20:41:20 +090013521 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013522}
13523
INADA Naoki3ae20562017-01-16 20:41:20 +090013524/*[clinic input]
13525str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013526
INADA Naoki3ae20562017-01-16 20:41:20 +090013527Partition the string into three parts using the given separator.
13528
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013529This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013530the separator is found, returns a 3-tuple containing the part before the
13531separator, the separator itself, and the part after it.
13532
13533If the separator is not found, returns a 3-tuple containing two empty strings
13534and the original string.
13535[clinic start generated code]*/
13536
13537static PyObject *
13538unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013539/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013540{
INADA Naoki3ae20562017-01-16 20:41:20 +090013541 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013542}
13543
Alexander Belopolsky40018472011-02-26 01:02:56 +000013544PyObject *
13545PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013546{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013547 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013548 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013549
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013550 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013551}
13552
INADA Naoki3ae20562017-01-16 20:41:20 +090013553/*[clinic input]
13554str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013555
INADA Naoki3ae20562017-01-16 20:41:20 +090013556Return a list of the words in the string, using sep as the delimiter string.
13557
13558Splits are done starting at the end of the string and working to the front.
13559[clinic start generated code]*/
13560
13561static PyObject *
13562unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13563/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013564{
INADA Naoki3ae20562017-01-16 20:41:20 +090013565 if (sep == Py_None)
13566 return rsplit(self, NULL, maxsplit);
13567 if (PyUnicode_Check(sep))
13568 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013569
Victor Stinner998b8062018-09-12 00:23:25 +020013570 PyErr_Format(PyExc_TypeError,
13571 "must be str or None, not %.100s",
13572 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013573 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013574}
13575
INADA Naoki3ae20562017-01-16 20:41:20 +090013576/*[clinic input]
13577str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013578
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013579 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013580
13581Return a list of the lines in the string, breaking at line boundaries.
13582
13583Line breaks are not included in the resulting list unless keepends is given and
13584true.
13585[clinic start generated code]*/
13586
13587static PyObject *
13588unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013589/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013590{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013591 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013592}
13593
13594static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013595PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013596{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013597 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013598}
13599
INADA Naoki3ae20562017-01-16 20:41:20 +090013600/*[clinic input]
13601str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013602
INADA Naoki3ae20562017-01-16 20:41:20 +090013603Convert uppercase characters to lowercase and lowercase characters to uppercase.
13604[clinic start generated code]*/
13605
13606static PyObject *
13607unicode_swapcase_impl(PyObject *self)
13608/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013609{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013610 if (PyUnicode_READY(self) == -1)
13611 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013612 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013613}
13614
Larry Hastings61272b72014-01-07 12:41:53 -080013615/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013616
Larry Hastings31826802013-10-19 00:09:25 -070013617@staticmethod
13618str.maketrans as unicode_maketrans
13619
13620 x: object
13621
13622 y: unicode=NULL
13623
13624 z: unicode=NULL
13625
13626 /
13627
13628Return a translation table usable for str.translate().
13629
13630If there is only one argument, it must be a dictionary mapping Unicode
13631ordinals (integers) or characters to Unicode ordinals, strings or None.
13632Character keys will be then converted to ordinals.
13633If there are two arguments, they must be strings of equal length, and
13634in the resulting dictionary, each character in x will be mapped to the
13635character at the same position in y. If there is a third argument, it
13636must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013637[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013638
Larry Hastings31826802013-10-19 00:09:25 -070013639static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013640unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013641/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013642{
Georg Brandlceee0772007-11-27 23:48:05 +000013643 PyObject *new = NULL, *key, *value;
13644 Py_ssize_t i = 0;
13645 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013646
Georg Brandlceee0772007-11-27 23:48:05 +000013647 new = PyDict_New();
13648 if (!new)
13649 return NULL;
13650 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013651 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013652 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013653
Georg Brandlceee0772007-11-27 23:48:05 +000013654 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013655 if (!PyUnicode_Check(x)) {
13656 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13657 "be a string if there is a second argument");
13658 goto err;
13659 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013660 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013661 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13662 "arguments must have equal length");
13663 goto err;
13664 }
13665 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013666 x_kind = PyUnicode_KIND(x);
13667 y_kind = PyUnicode_KIND(y);
13668 x_data = PyUnicode_DATA(x);
13669 y_data = PyUnicode_DATA(y);
13670 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13671 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013672 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013673 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013674 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013675 if (!value) {
13676 Py_DECREF(key);
13677 goto err;
13678 }
Georg Brandlceee0772007-11-27 23:48:05 +000013679 res = PyDict_SetItem(new, key, value);
13680 Py_DECREF(key);
13681 Py_DECREF(value);
13682 if (res < 0)
13683 goto err;
13684 }
13685 /* create entries for deleting chars in z */
13686 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013687 z_kind = PyUnicode_KIND(z);
13688 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013689 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013690 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013691 if (!key)
13692 goto err;
13693 res = PyDict_SetItem(new, key, Py_None);
13694 Py_DECREF(key);
13695 if (res < 0)
13696 goto err;
13697 }
13698 }
13699 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013700 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013701 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013702
Georg Brandlceee0772007-11-27 23:48:05 +000013703 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013704 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013705 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13706 "to maketrans it must be a dict");
13707 goto err;
13708 }
13709 /* copy entries into the new dict, converting string keys to int keys */
13710 while (PyDict_Next(x, &i, &key, &value)) {
13711 if (PyUnicode_Check(key)) {
13712 /* convert string keys to integer keys */
13713 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013714 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013715 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13716 "table must be of length 1");
13717 goto err;
13718 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013719 kind = PyUnicode_KIND(key);
13720 data = PyUnicode_DATA(key);
13721 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013722 if (!newkey)
13723 goto err;
13724 res = PyDict_SetItem(new, newkey, value);
13725 Py_DECREF(newkey);
13726 if (res < 0)
13727 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013728 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013729 /* just keep integer keys */
13730 if (PyDict_SetItem(new, key, value) < 0)
13731 goto err;
13732 } else {
13733 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13734 "be strings or integers");
13735 goto err;
13736 }
13737 }
13738 }
13739 return new;
13740 err:
13741 Py_DECREF(new);
13742 return NULL;
13743}
13744
INADA Naoki3ae20562017-01-16 20:41:20 +090013745/*[clinic input]
13746str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013747
INADA Naoki3ae20562017-01-16 20:41:20 +090013748 table: object
13749 Translation table, which must be a mapping of Unicode ordinals to
13750 Unicode ordinals, strings, or None.
13751 /
13752
13753Replace each character in the string using the given translation table.
13754
13755The table must implement lookup/indexing via __getitem__, for instance a
13756dictionary or list. If this operation raises LookupError, the character is
13757left untouched. Characters mapped to None are deleted.
13758[clinic start generated code]*/
13759
13760static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013761unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013762/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013763{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013764 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013765}
13766
INADA Naoki3ae20562017-01-16 20:41:20 +090013767/*[clinic input]
13768str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013769
INADA Naoki3ae20562017-01-16 20:41:20 +090013770Return a copy of the string converted to uppercase.
13771[clinic start generated code]*/
13772
13773static PyObject *
13774unicode_upper_impl(PyObject *self)
13775/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013776{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013777 if (PyUnicode_READY(self) == -1)
13778 return NULL;
13779 if (PyUnicode_IS_ASCII(self))
13780 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013781 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013782}
13783
INADA Naoki3ae20562017-01-16 20:41:20 +090013784/*[clinic input]
13785str.zfill as unicode_zfill
13786
13787 width: Py_ssize_t
13788 /
13789
13790Pad a numeric string with zeros on the left, to fill a field of the given width.
13791
13792The string is never truncated.
13793[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013794
13795static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013796unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013797/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013798{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013799 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013800 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013801 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013802 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013803 Py_UCS4 chr;
13804
Benjamin Petersonbac79492012-01-14 13:34:47 -050013805 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013806 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013807
Victor Stinnerc4b49542011-12-11 22:44:26 +010013808 if (PyUnicode_GET_LENGTH(self) >= width)
13809 return unicode_result_unchanged(self);
13810
13811 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013812
13813 u = pad(self, fill, 0, '0');
13814
Walter Dörwald068325e2002-04-15 13:36:47 +000013815 if (u == NULL)
13816 return NULL;
13817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013818 kind = PyUnicode_KIND(u);
13819 data = PyUnicode_DATA(u);
13820 chr = PyUnicode_READ(kind, data, fill);
13821
13822 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013823 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013824 PyUnicode_WRITE(kind, data, 0, chr);
13825 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013826 }
13827
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013828 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013829 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013830}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013831
#if 0
/* Compiled out: wrapper around PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13839
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013840PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013841 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013842\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013843Return True if S starts with the specified prefix, False otherwise.\n\
13844With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013845With optional end, stop comparing S at that position.\n\
13846prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013847
13848static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013849unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013850 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013851{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013852 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013853 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013854 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013855 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013856 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013857
Jesus Ceaac451502011-04-20 17:09:23 +020013858 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013859 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013860 if (PyTuple_Check(subobj)) {
13861 Py_ssize_t i;
13862 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013863 substring = PyTuple_GET_ITEM(subobj, i);
13864 if (!PyUnicode_Check(substring)) {
13865 PyErr_Format(PyExc_TypeError,
13866 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013867 "not %.100s",
13868 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013869 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013870 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013871 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013872 if (result == -1)
13873 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013874 if (result) {
13875 Py_RETURN_TRUE;
13876 }
13877 }
13878 /* nothing matched */
13879 Py_RETURN_FALSE;
13880 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013881 if (!PyUnicode_Check(subobj)) {
13882 PyErr_Format(PyExc_TypeError,
13883 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013884 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013885 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013886 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013887 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013888 if (result == -1)
13889 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013890 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013891}
13892
13893
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013894PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013895 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013896\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013897Return True if S ends with the specified suffix, False otherwise.\n\
13898With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013899With optional end, stop comparing S at that position.\n\
13900suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013901
13902static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013903unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013904 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013905{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013906 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013907 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013908 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013909 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013910 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013911
Jesus Ceaac451502011-04-20 17:09:23 +020013912 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013913 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013914 if (PyTuple_Check(subobj)) {
13915 Py_ssize_t i;
13916 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013917 substring = PyTuple_GET_ITEM(subobj, i);
13918 if (!PyUnicode_Check(substring)) {
13919 PyErr_Format(PyExc_TypeError,
13920 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013921 "not %.100s",
13922 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013923 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013924 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013925 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013926 if (result == -1)
13927 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013928 if (result) {
13929 Py_RETURN_TRUE;
13930 }
13931 }
13932 Py_RETURN_FALSE;
13933 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013934 if (!PyUnicode_Check(subobj)) {
13935 PyErr_Format(PyExc_TypeError,
13936 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013937 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013938 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013939 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013940 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013941 if (result == -1)
13942 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013943 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013944}
13945
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013946static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013947_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013948{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013949 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13950 writer->data = PyUnicode_DATA(writer->buffer);
13951
13952 if (!writer->readonly) {
13953 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013954 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013955 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013956 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013957 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13958 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13959 writer->kind = PyUnicode_WCHAR_KIND;
13960 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13961
Victor Stinner8f674cc2013-04-17 23:02:17 +020013962 /* Copy-on-write mode: set buffer size to 0 so
13963 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13964 * next write. */
13965 writer->size = 0;
13966 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013967}
13968
Victor Stinnerd3f08822012-05-29 12:57:52 +020013969void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013970_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013971{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013972 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013973
13974 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013975 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013976
13977 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13978 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13979 writer->kind = PyUnicode_WCHAR_KIND;
13980 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013981}
13982
Inada Naoki770847a2019-06-24 12:30:24 +090013983// Initialize _PyUnicodeWriter with initial buffer
13984static inline void
13985_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13986{
13987 memset(writer, 0, sizeof(*writer));
13988 writer->buffer = buffer;
13989 _PyUnicodeWriter_Update(writer);
13990 writer->min_length = writer->size;
13991}
13992
Victor Stinnerd3f08822012-05-29 12:57:52 +020013993int
13994_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13995 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013996{
13997 Py_ssize_t newlen;
13998 PyObject *newbuffer;
13999
Victor Stinner2740e462016-09-06 16:58:36 -070014000 assert(maxchar <= MAX_UNICODE);
14001
Victor Stinnerca9381e2015-09-22 00:58:32 +020014002 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020014003 assert((maxchar > writer->maxchar && length >= 0)
14004 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014005
Victor Stinner202fdca2012-05-07 12:47:02 +020014006 if (length > PY_SSIZE_T_MAX - writer->pos) {
14007 PyErr_NoMemory();
14008 return -1;
14009 }
14010 newlen = writer->pos + length;
14011
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014012 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020014013
Victor Stinnerd3f08822012-05-29 12:57:52 +020014014 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020014015 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010014016 if (writer->overallocate
14017 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14018 /* overallocate to limit the number of realloc() */
14019 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014020 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014021 if (newlen < writer->min_length)
14022 newlen = writer->min_length;
14023
Victor Stinnerd3f08822012-05-29 12:57:52 +020014024 writer->buffer = PyUnicode_New(newlen, maxchar);
14025 if (writer->buffer == NULL)
14026 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014027 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014028 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010014029 if (writer->overallocate
14030 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14031 /* overallocate to limit the number of realloc() */
14032 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014033 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014034 if (newlen < writer->min_length)
14035 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014036
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014037 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020014038 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030014039 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020014040 newbuffer = PyUnicode_New(newlen, maxchar);
14041 if (newbuffer == NULL)
14042 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014043 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14044 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020014045 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014046 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020014047 }
14048 else {
14049 newbuffer = resize_compact(writer->buffer, newlen);
14050 if (newbuffer == NULL)
14051 return -1;
14052 }
14053 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020014054 }
14055 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014056 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014057 newbuffer = PyUnicode_New(writer->size, maxchar);
14058 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020014059 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014060 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14061 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030014062 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014063 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014064 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014065 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010014066
14067#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020014068}
14069
Victor Stinnerca9381e2015-09-22 00:58:32 +020014070int
14071_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14072 enum PyUnicode_Kind kind)
14073{
14074 Py_UCS4 maxchar;
14075
14076 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14077 assert(writer->kind < kind);
14078
14079 switch (kind)
14080 {
14081 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14082 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
Victor Stinner99768342021-03-17 21:46:53 +010014083 case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
Victor Stinnerca9381e2015-09-22 00:58:32 +020014084 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014085 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020014086 }
14087
14088 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14089}
14090
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014091static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014092_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020014093{
Victor Stinner2740e462016-09-06 16:58:36 -070014094 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020014095 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14096 return -1;
14097 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14098 writer->pos++;
14099 return 0;
14100}
14101
14102int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014103_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14104{
14105 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14106}
14107
14108int
Victor Stinnerd3f08822012-05-29 12:57:52 +020014109_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14110{
14111 Py_UCS4 maxchar;
14112 Py_ssize_t len;
14113
14114 if (PyUnicode_READY(str) == -1)
14115 return -1;
14116 len = PyUnicode_GET_LENGTH(str);
14117 if (len == 0)
14118 return 0;
14119 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14120 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014121 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010014122 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020014123 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014124 Py_INCREF(str);
14125 writer->buffer = str;
14126 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014127 writer->pos += len;
14128 return 0;
14129 }
14130 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14131 return -1;
14132 }
14133 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14134 str, 0, len);
14135 writer->pos += len;
14136 return 0;
14137}
14138
Victor Stinnere215d962012-10-06 23:03:36 +020014139int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014140_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14141 Py_ssize_t start, Py_ssize_t end)
14142{
14143 Py_UCS4 maxchar;
14144 Py_ssize_t len;
14145
14146 if (PyUnicode_READY(str) == -1)
14147 return -1;
14148
14149 assert(0 <= start);
14150 assert(end <= PyUnicode_GET_LENGTH(str));
14151 assert(start <= end);
14152
14153 if (end == 0)
14154 return 0;
14155
14156 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14157 return _PyUnicodeWriter_WriteStr(writer, str);
14158
14159 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14160 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14161 else
14162 maxchar = writer->maxchar;
14163 len = end - start;
14164
14165 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14166 return -1;
14167
14168 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14169 str, start, len);
14170 writer->pos += len;
14171 return 0;
14172}
14173
14174int
Victor Stinner4a587072013-11-19 12:54:53 +010014175_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14176 const char *ascii, Py_ssize_t len)
14177{
14178 if (len == -1)
14179 len = strlen(ascii);
14180
Andy Lestere6be9b52020-02-11 20:28:35 -060014181 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014182
14183 if (writer->buffer == NULL && !writer->overallocate) {
14184 PyObject *str;
14185
14186 str = _PyUnicode_FromASCII(ascii, len);
14187 if (str == NULL)
14188 return -1;
14189
14190 writer->readonly = 1;
14191 writer->buffer = str;
14192 _PyUnicodeWriter_Update(writer);
14193 writer->pos += len;
14194 return 0;
14195 }
14196
14197 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14198 return -1;
14199
14200 switch (writer->kind)
14201 {
14202 case PyUnicode_1BYTE_KIND:
14203 {
14204 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14205 Py_UCS1 *data = writer->data;
14206
Christian Heimesf051e432016-09-13 20:22:02 +020014207 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014208 break;
14209 }
14210 case PyUnicode_2BYTE_KIND:
14211 {
14212 _PyUnicode_CONVERT_BYTES(
14213 Py_UCS1, Py_UCS2,
14214 ascii, ascii + len,
14215 (Py_UCS2 *)writer->data + writer->pos);
14216 break;
14217 }
14218 case PyUnicode_4BYTE_KIND:
14219 {
14220 _PyUnicode_CONVERT_BYTES(
14221 Py_UCS1, Py_UCS4,
14222 ascii, ascii + len,
14223 (Py_UCS4 *)writer->data + writer->pos);
14224 break;
14225 }
14226 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014227 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014228 }
14229
14230 writer->pos += len;
14231 return 0;
14232}
14233
14234int
14235_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14236 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014237{
14238 Py_UCS4 maxchar;
14239
Andy Lestere6be9b52020-02-11 20:28:35 -060014240 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014241 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14242 return -1;
14243 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14244 writer->pos += len;
14245 return 0;
14246}
14247
Victor Stinnerd3f08822012-05-29 12:57:52 +020014248PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014249_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014250{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014251 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014252
Victor Stinnerd3f08822012-05-29 12:57:52 +020014253 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014254 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014255 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014256 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014257
14258 str = writer->buffer;
14259 writer->buffer = NULL;
14260
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014261 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014262 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14263 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014264 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014265
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014266 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14267 PyObject *str2;
14268 str2 = resize_compact(str, writer->pos);
14269 if (str2 == NULL) {
14270 Py_DECREF(str);
14271 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014272 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014273 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014274 }
14275
Victor Stinner15a0bd32013-07-08 22:29:55 +020014276 assert(_PyUnicode_CheckConsistency(str, 1));
14277 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014278}
14279
Victor Stinnerd3f08822012-05-29 12:57:52 +020014280void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014281_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014282{
14283 Py_CLEAR(writer->buffer);
14284}
14285
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014286#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014287
14288PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014289 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014290\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014291Return a formatted version of S, using substitutions from args and kwargs.\n\
14292The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014293
Eric Smith27bbca62010-11-04 17:06:58 +000014294PyDoc_STRVAR(format_map__doc__,
14295 "S.format_map(mapping) -> str\n\
14296\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014297Return a formatted version of S, using substitutions from mapping.\n\
14298The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014299
INADA Naoki3ae20562017-01-16 20:41:20 +090014300/*[clinic input]
14301str.__format__ as unicode___format__
14302
14303 format_spec: unicode
14304 /
14305
14306Return a formatted version of the string as described by format_spec.
14307[clinic start generated code]*/
14308
Eric Smith4a7d76d2008-05-30 18:10:19 +000014309static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014310unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014311/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014312{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014313 _PyUnicodeWriter writer;
14314 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014315
Victor Stinnerd3f08822012-05-29 12:57:52 +020014316 if (PyUnicode_READY(self) == -1)
14317 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014318 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014319 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14320 self, format_spec, 0,
14321 PyUnicode_GET_LENGTH(format_spec));
14322 if (ret == -1) {
14323 _PyUnicodeWriter_Dealloc(&writer);
14324 return NULL;
14325 }
14326 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014327}
14328
INADA Naoki3ae20562017-01-16 20:41:20 +090014329/*[clinic input]
14330str.__sizeof__ as unicode_sizeof
14331
14332Return the size of the string in memory, in bytes.
14333[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014334
14335static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014336unicode_sizeof_impl(PyObject *self)
14337/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014338{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014339 Py_ssize_t size;
14340
14341 /* If it's a compact object, account for base structure +
14342 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014343 if (PyUnicode_IS_COMPACT_ASCII(self))
14344 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14345 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014346 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014347 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014348 else {
14349 /* If it is a two-block object, account for base object, and
14350 for character block if present. */
14351 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014352 if (_PyUnicode_DATA_ANY(self))
14353 size += (PyUnicode_GET_LENGTH(self) + 1) *
14354 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014355 }
14356 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014357 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014358 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14359 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14360 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14361 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014362
14363 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014364}
14365
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014366static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014367unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014368{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014369 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014370 if (!copy)
14371 return NULL;
14372 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014373}
14374
Guido van Rossumd57fd912000-03-10 22:53:23 +000014375static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014376 UNICODE_ENCODE_METHODDEF
14377 UNICODE_REPLACE_METHODDEF
14378 UNICODE_SPLIT_METHODDEF
14379 UNICODE_RSPLIT_METHODDEF
14380 UNICODE_JOIN_METHODDEF
14381 UNICODE_CAPITALIZE_METHODDEF
14382 UNICODE_CASEFOLD_METHODDEF
14383 UNICODE_TITLE_METHODDEF
14384 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014385 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014386 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014387 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014388 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014389 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014390 UNICODE_LJUST_METHODDEF
14391 UNICODE_LOWER_METHODDEF
14392 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014393 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14394 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014395 UNICODE_RJUST_METHODDEF
14396 UNICODE_RSTRIP_METHODDEF
14397 UNICODE_RPARTITION_METHODDEF
14398 UNICODE_SPLITLINES_METHODDEF
14399 UNICODE_STRIP_METHODDEF
14400 UNICODE_SWAPCASE_METHODDEF
14401 UNICODE_TRANSLATE_METHODDEF
14402 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014403 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14404 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014405 UNICODE_REMOVEPREFIX_METHODDEF
14406 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014407 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014408 UNICODE_ISLOWER_METHODDEF
14409 UNICODE_ISUPPER_METHODDEF
14410 UNICODE_ISTITLE_METHODDEF
14411 UNICODE_ISSPACE_METHODDEF
14412 UNICODE_ISDECIMAL_METHODDEF
14413 UNICODE_ISDIGIT_METHODDEF
14414 UNICODE_ISNUMERIC_METHODDEF
14415 UNICODE_ISALPHA_METHODDEF
14416 UNICODE_ISALNUM_METHODDEF
14417 UNICODE_ISIDENTIFIER_METHODDEF
14418 UNICODE_ISPRINTABLE_METHODDEF
14419 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014420 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014421 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014422 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014423 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014424 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014425#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014426 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014427 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014428#endif
14429
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014430 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014431 {NULL, NULL}
14432};
14433
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014434static PyObject *
14435unicode_mod(PyObject *v, PyObject *w)
14436{
Brian Curtindfc80e32011-08-10 20:28:54 -050014437 if (!PyUnicode_Check(v))
14438 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014439 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014440}
14441
14442static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014443 0, /*nb_add*/
14444 0, /*nb_subtract*/
14445 0, /*nb_multiply*/
14446 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014447};
14448
Guido van Rossumd57fd912000-03-10 22:53:23 +000014449static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014450 (lenfunc) unicode_length, /* sq_length */
14451 PyUnicode_Concat, /* sq_concat */
14452 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14453 (ssizeargfunc) unicode_getitem, /* sq_item */
14454 0, /* sq_slice */
14455 0, /* sq_ass_item */
14456 0, /* sq_ass_slice */
14457 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014458};
14459
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014460static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014461unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014462{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014463 if (PyUnicode_READY(self) == -1)
14464 return NULL;
14465
Victor Stinnera15e2602020-04-08 02:01:56 +020014466 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014467 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014468 if (i == -1 && PyErr_Occurred())
14469 return NULL;
14470 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014471 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014472 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014473 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014474 Py_ssize_t start, stop, step, slicelength, i;
14475 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014476 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014477 const void *src_data;
14478 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014479 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014480 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014481
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014482 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014483 return NULL;
14484 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014485 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14486 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014487
14488 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014489 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014490 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014491 slicelength == PyUnicode_GET_LENGTH(self)) {
14492 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014493 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014494 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014495 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014496 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014497 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014498 src_kind = PyUnicode_KIND(self);
14499 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014500 if (!PyUnicode_IS_ASCII(self)) {
14501 kind_limit = kind_maxchar_limit(src_kind);
14502 max_char = 0;
14503 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14504 ch = PyUnicode_READ(src_kind, src_data, cur);
14505 if (ch > max_char) {
14506 max_char = ch;
14507 if (max_char >= kind_limit)
14508 break;
14509 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014510 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014511 }
Victor Stinner55c99112011-10-13 01:17:06 +020014512 else
14513 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014514 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014515 if (result == NULL)
14516 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014517 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014518 dest_data = PyUnicode_DATA(result);
14519
14520 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014521 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14522 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014523 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014524 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014525 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014526 } else {
14527 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14528 return NULL;
14529 }
14530}
14531
14532static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014533 (lenfunc)unicode_length, /* mp_length */
14534 (binaryfunc)unicode_subscript, /* mp_subscript */
14535 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014536};
14537
Guido van Rossumd57fd912000-03-10 22:53:23 +000014538
Guido van Rossumd57fd912000-03-10 22:53:23 +000014539/* Helpers for PyUnicode_Format() */
14540
Victor Stinnera47082312012-10-04 02:19:54 +020014541struct unicode_formatter_t {
14542 PyObject *args;
14543 int args_owned;
14544 Py_ssize_t arglen, argidx;
14545 PyObject *dict;
14546
14547 enum PyUnicode_Kind fmtkind;
14548 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014549 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014550 PyObject *fmtstr;
14551
14552 _PyUnicodeWriter writer;
14553};
14554
14555struct unicode_format_arg_t {
14556 Py_UCS4 ch;
14557 int flags;
14558 Py_ssize_t width;
14559 int prec;
14560 int sign;
14561};
14562
Guido van Rossumd57fd912000-03-10 22:53:23 +000014563static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014564unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014565{
Victor Stinnera47082312012-10-04 02:19:54 +020014566 Py_ssize_t argidx = ctx->argidx;
14567
14568 if (argidx < ctx->arglen) {
14569 ctx->argidx++;
14570 if (ctx->arglen < 0)
14571 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014572 else
Victor Stinnera47082312012-10-04 02:19:54 +020014573 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014574 }
14575 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014576 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014577 return NULL;
14578}
14579
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014580/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014581
Victor Stinnera47082312012-10-04 02:19:54 +020014582/* Format a float into the writer if the writer is not NULL, or into *p_output
14583 otherwise.
14584
14585 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014586static int
Victor Stinnera47082312012-10-04 02:19:54 +020014587formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14588 PyObject **p_output,
14589 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014590{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014591 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014592 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014593 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014594 int prec;
14595 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014596
Guido van Rossumd57fd912000-03-10 22:53:23 +000014597 x = PyFloat_AsDouble(v);
14598 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014599 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014600
Victor Stinnera47082312012-10-04 02:19:54 +020014601 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014602 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014603 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014604
Victor Stinnera47082312012-10-04 02:19:54 +020014605 if (arg->flags & F_ALT)
14606 dtoa_flags = Py_DTSF_ALT;
14607 else
14608 dtoa_flags = 0;
14609 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014610 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014611 return -1;
14612 len = strlen(p);
14613 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014614 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014615 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014616 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014617 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014618 }
14619 else
14620 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014621 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014622 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014623}
14624
Victor Stinnerd0880d52012-04-27 23:40:13 +020014625/* formatlong() emulates the format codes d, u, o, x and X, and
14626 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14627 * Python's regular ints.
14628 * Return value: a new PyUnicodeObject*, or NULL if error.
14629 * The output string is of the form
14630 * "-"? ("0x" | "0X")? digit+
14631 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14632 * set in flags. The case of hex digits will be correct,
14633 * There will be at least prec digits, zero-filled on the left if
14634 * necessary to get that many.
14635 * val object to be converted
14636 * flags bitmask of format flags; only F_ALT is looked at
14637 * prec minimum number of digits; 0-fill on left if needed
14638 * type a character in [duoxX]; u acts the same as d
14639 *
14640 * CAUTION: o, x and X conversions on regular ints can never
14641 * produce a '-' sign, but can for Python's unbounded ints.
14642 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014643PyObject *
14644_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014645{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014646 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014647 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014648 Py_ssize_t i;
14649 int sign; /* 1 if '-', else 0 */
14650 int len; /* number of characters */
14651 Py_ssize_t llen;
14652 int numdigits; /* len == numnondigits + numdigits */
14653 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014654
Victor Stinnerd0880d52012-04-27 23:40:13 +020014655 /* Avoid exceeding SSIZE_T_MAX */
14656 if (prec > INT_MAX-3) {
14657 PyErr_SetString(PyExc_OverflowError,
14658 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014659 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014660 }
14661
14662 assert(PyLong_Check(val));
14663
14664 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014665 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014666 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014667 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014668 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014669 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014670 /* int and int subclasses should print numerically when a numeric */
14671 /* format code is used (see issue18780) */
14672 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014673 break;
14674 case 'o':
14675 numnondigits = 2;
14676 result = PyNumber_ToBase(val, 8);
14677 break;
14678 case 'x':
14679 case 'X':
14680 numnondigits = 2;
14681 result = PyNumber_ToBase(val, 16);
14682 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014683 }
14684 if (!result)
14685 return NULL;
14686
14687 assert(unicode_modifiable(result));
14688 assert(PyUnicode_IS_READY(result));
14689 assert(PyUnicode_IS_ASCII(result));
14690
14691 /* To modify the string in-place, there can only be one reference. */
14692 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014693 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014694 PyErr_BadInternalCall();
14695 return NULL;
14696 }
14697 buf = PyUnicode_DATA(result);
14698 llen = PyUnicode_GET_LENGTH(result);
14699 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014700 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014701 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014702 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014703 return NULL;
14704 }
14705 len = (int)llen;
14706 sign = buf[0] == '-';
14707 numnondigits += sign;
14708 numdigits = len - numnondigits;
14709 assert(numdigits > 0);
14710
14711 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014712 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014713 (type == 'o' || type == 'x' || type == 'X'))) {
14714 assert(buf[sign] == '0');
14715 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14716 buf[sign+1] == 'o');
14717 numnondigits -= 2;
14718 buf += 2;
14719 len -= 2;
14720 if (sign)
14721 buf[0] = '-';
14722 assert(len == numnondigits + numdigits);
14723 assert(numdigits > 0);
14724 }
14725
14726 /* Fill with leading zeroes to meet minimum width. */
14727 if (prec > numdigits) {
14728 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14729 numnondigits + prec);
14730 char *b1;
14731 if (!r1) {
14732 Py_DECREF(result);
14733 return NULL;
14734 }
14735 b1 = PyBytes_AS_STRING(r1);
14736 for (i = 0; i < numnondigits; ++i)
14737 *b1++ = *buf++;
14738 for (i = 0; i < prec - numdigits; i++)
14739 *b1++ = '0';
14740 for (i = 0; i < numdigits; i++)
14741 *b1++ = *buf++;
14742 *b1 = '\0';
14743 Py_DECREF(result);
14744 result = r1;
14745 buf = PyBytes_AS_STRING(result);
14746 len = numnondigits + prec;
14747 }
14748
14749 /* Fix up case for hex conversions. */
14750 if (type == 'X') {
14751 /* Need to convert all lower case letters to upper case.
14752 and need to convert 0x to 0X (and -0x to -0X). */
14753 for (i = 0; i < len; i++)
14754 if (buf[i] >= 'a' && buf[i] <= 'x')
14755 buf[i] -= 'a'-'A';
14756 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014757 if (!PyUnicode_Check(result)
14758 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014759 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014760 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014761 Py_DECREF(result);
14762 result = unicode;
14763 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014764 else if (len != PyUnicode_GET_LENGTH(result)) {
14765 if (PyUnicode_Resize(&result, len) < 0)
14766 Py_CLEAR(result);
14767 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014768 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014769}
14770
Ethan Furmandf3ed242014-01-05 06:50:30 -080014771/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014772 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014773 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014774 * -1 and raise an exception on error */
14775static int
Victor Stinnera47082312012-10-04 02:19:54 +020014776mainformatlong(PyObject *v,
14777 struct unicode_format_arg_t *arg,
14778 PyObject **p_output,
14779 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014780{
14781 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014782 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014783
14784 if (!PyNumber_Check(v))
14785 goto wrongtype;
14786
Ethan Furman9ab74802014-03-21 06:38:46 -070014787 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014788 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014789 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014790 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014791 }
14792 else {
14793 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014794 }
14795 if (iobj == NULL ) {
14796 if (PyErr_ExceptionMatches(PyExc_TypeError))
14797 goto wrongtype;
14798 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014799 }
14800 assert(PyLong_Check(iobj));
14801 }
14802 else {
14803 iobj = v;
14804 Py_INCREF(iobj);
14805 }
14806
14807 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014808 && arg->width == -1 && arg->prec == -1
14809 && !(arg->flags & (F_SIGN | F_BLANK))
14810 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014811 {
14812 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014813 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014814 int base;
14815
Victor Stinnera47082312012-10-04 02:19:54 +020014816 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014817 {
14818 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014819 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014820 case 'd':
14821 case 'i':
14822 case 'u':
14823 base = 10;
14824 break;
14825 case 'o':
14826 base = 8;
14827 break;
14828 case 'x':
14829 case 'X':
14830 base = 16;
14831 break;
14832 }
14833
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014834 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14835 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014836 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014837 }
14838 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014839 return 1;
14840 }
14841
Ethan Furmanb95b5612015-01-23 20:05:18 -080014842 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014843 Py_DECREF(iobj);
14844 if (res == NULL)
14845 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014846 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014847 return 0;
14848
14849wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014850 switch(type)
14851 {
14852 case 'o':
14853 case 'x':
14854 case 'X':
14855 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014856 "%%%c format: an integer is required, "
14857 "not %.200s",
14858 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014859 break;
14860 default:
14861 PyErr_Format(PyExc_TypeError,
Serhiy Storchakae2ec0b22020-10-09 14:14:37 +030014862 "%%%c format: a real number is required, "
Victor Stinner998b8062018-09-12 00:23:25 +020014863 "not %.200s",
14864 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014865 break;
14866 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014867 return -1;
14868}
14869
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014870static Py_UCS4
14871formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014872{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014873 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014874 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014875 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014876 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014877 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014878 goto onError;
14879 }
14880 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014881 int overflow;
14882 long x = PyLong_AsLongAndOverflow(v, &overflow);
14883 if (x == -1 && PyErr_Occurred()) {
14884 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014885 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014886 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014887 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014888 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014889
Victor Stinner8faf8212011-12-08 22:14:11 +010014890 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014891 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014892 PyErr_SetString(PyExc_OverflowError,
14893 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014894 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014895 }
14896
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014897 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014898 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014899
Benjamin Peterson29060642009-01-31 22:14:21 +000014900 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014901 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014902 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014903 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014904}
14905
Victor Stinnera47082312012-10-04 02:19:54 +020014906/* Parse options of an argument: flags, width, precision.
14907 Handle also "%(name)" syntax.
14908
14909 Return 0 if the argument has been formatted into arg->str.
14910 Return 1 if the argument has been written into ctx->writer,
14911 Raise an exception and return -1 on error. */
14912static int
14913unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14914 struct unicode_format_arg_t *arg)
14915{
14916#define FORMAT_READ(ctx) \
14917 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14918
14919 PyObject *v;
14920
Victor Stinnera47082312012-10-04 02:19:54 +020014921 if (arg->ch == '(') {
14922 /* Get argument value from a dictionary. Example: "%(name)s". */
14923 Py_ssize_t keystart;
14924 Py_ssize_t keylen;
14925 PyObject *key;
14926 int pcount = 1;
14927
14928 if (ctx->dict == NULL) {
14929 PyErr_SetString(PyExc_TypeError,
14930 "format requires a mapping");
14931 return -1;
14932 }
14933 ++ctx->fmtpos;
14934 --ctx->fmtcnt;
14935 keystart = ctx->fmtpos;
14936 /* Skip over balanced parentheses */
14937 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14938 arg->ch = FORMAT_READ(ctx);
14939 if (arg->ch == ')')
14940 --pcount;
14941 else if (arg->ch == '(')
14942 ++pcount;
14943 ctx->fmtpos++;
14944 }
14945 keylen = ctx->fmtpos - keystart - 1;
14946 if (ctx->fmtcnt < 0 || pcount > 0) {
14947 PyErr_SetString(PyExc_ValueError,
14948 "incomplete format key");
14949 return -1;
14950 }
14951 key = PyUnicode_Substring(ctx->fmtstr,
14952 keystart, keystart + keylen);
14953 if (key == NULL)
14954 return -1;
14955 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014956 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014957 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014958 }
14959 ctx->args = PyObject_GetItem(ctx->dict, key);
14960 Py_DECREF(key);
14961 if (ctx->args == NULL)
14962 return -1;
14963 ctx->args_owned = 1;
14964 ctx->arglen = -1;
14965 ctx->argidx = -2;
14966 }
14967
14968 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014969 while (--ctx->fmtcnt >= 0) {
14970 arg->ch = FORMAT_READ(ctx);
14971 ctx->fmtpos++;
14972 switch (arg->ch) {
14973 case '-': arg->flags |= F_LJUST; continue;
14974 case '+': arg->flags |= F_SIGN; continue;
14975 case ' ': arg->flags |= F_BLANK; continue;
14976 case '#': arg->flags |= F_ALT; continue;
14977 case '0': arg->flags |= F_ZERO; continue;
14978 }
14979 break;
14980 }
14981
14982 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014983 if (arg->ch == '*') {
14984 v = unicode_format_getnextarg(ctx);
14985 if (v == NULL)
14986 return -1;
14987 if (!PyLong_Check(v)) {
14988 PyErr_SetString(PyExc_TypeError,
14989 "* wants int");
14990 return -1;
14991 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014992 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014993 if (arg->width == -1 && PyErr_Occurred())
14994 return -1;
14995 if (arg->width < 0) {
14996 arg->flags |= F_LJUST;
14997 arg->width = -arg->width;
14998 }
14999 if (--ctx->fmtcnt >= 0) {
15000 arg->ch = FORMAT_READ(ctx);
15001 ctx->fmtpos++;
15002 }
15003 }
15004 else if (arg->ch >= '0' && arg->ch <= '9') {
15005 arg->width = arg->ch - '0';
15006 while (--ctx->fmtcnt >= 0) {
15007 arg->ch = FORMAT_READ(ctx);
15008 ctx->fmtpos++;
15009 if (arg->ch < '0' || arg->ch > '9')
15010 break;
15011 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
15012 mixing signed and unsigned comparison. Since arg->ch is between
15013 '0' and '9', casting to int is safe. */
15014 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
15015 PyErr_SetString(PyExc_ValueError,
15016 "width too big");
15017 return -1;
15018 }
15019 arg->width = arg->width*10 + (arg->ch - '0');
15020 }
15021 }
15022
15023 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020015024 if (arg->ch == '.') {
15025 arg->prec = 0;
15026 if (--ctx->fmtcnt >= 0) {
15027 arg->ch = FORMAT_READ(ctx);
15028 ctx->fmtpos++;
15029 }
15030 if (arg->ch == '*') {
15031 v = unicode_format_getnextarg(ctx);
15032 if (v == NULL)
15033 return -1;
15034 if (!PyLong_Check(v)) {
15035 PyErr_SetString(PyExc_TypeError,
15036 "* wants int");
15037 return -1;
15038 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015039 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015040 if (arg->prec == -1 && PyErr_Occurred())
15041 return -1;
15042 if (arg->prec < 0)
15043 arg->prec = 0;
15044 if (--ctx->fmtcnt >= 0) {
15045 arg->ch = FORMAT_READ(ctx);
15046 ctx->fmtpos++;
15047 }
15048 }
15049 else if (arg->ch >= '0' && arg->ch <= '9') {
15050 arg->prec = arg->ch - '0';
15051 while (--ctx->fmtcnt >= 0) {
15052 arg->ch = FORMAT_READ(ctx);
15053 ctx->fmtpos++;
15054 if (arg->ch < '0' || arg->ch > '9')
15055 break;
15056 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15057 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020015058 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020015059 return -1;
15060 }
15061 arg->prec = arg->prec*10 + (arg->ch - '0');
15062 }
15063 }
15064 }
15065
15066 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15067 if (ctx->fmtcnt >= 0) {
15068 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15069 if (--ctx->fmtcnt >= 0) {
15070 arg->ch = FORMAT_READ(ctx);
15071 ctx->fmtpos++;
15072 }
15073 }
15074 }
15075 if (ctx->fmtcnt < 0) {
15076 PyErr_SetString(PyExc_ValueError,
15077 "incomplete format");
15078 return -1;
15079 }
15080 return 0;
15081
15082#undef FORMAT_READ
15083}
15084
15085/* Format one argument. Supported conversion specifiers:
15086
15087 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080015088 - "i", "d", "u": int or float
15089 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020015090 - "e", "E", "f", "F", "g", "G": float
15091 - "c": int or str (1 character)
15092
Victor Stinner8dbd4212012-12-04 09:30:24 +010015093 When possible, the output is written directly into the Unicode writer
15094 (ctx->writer). A string is created when padding is required.
15095
Victor Stinnera47082312012-10-04 02:19:54 +020015096 Return 0 if the argument has been formatted into *p_str,
15097 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010015098 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020015099static int
15100unicode_format_arg_format(struct unicode_formatter_t *ctx,
15101 struct unicode_format_arg_t *arg,
15102 PyObject **p_str)
15103{
15104 PyObject *v;
15105 _PyUnicodeWriter *writer = &ctx->writer;
15106
15107 if (ctx->fmtcnt == 0)
15108 ctx->writer.overallocate = 0;
15109
Victor Stinnera47082312012-10-04 02:19:54 +020015110 v = unicode_format_getnextarg(ctx);
15111 if (v == NULL)
15112 return -1;
15113
Victor Stinnera47082312012-10-04 02:19:54 +020015114
15115 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020015116 case 's':
15117 case 'r':
15118 case 'a':
15119 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15120 /* Fast path */
15121 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15122 return -1;
15123 return 1;
15124 }
15125
15126 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15127 *p_str = v;
15128 Py_INCREF(*p_str);
15129 }
15130 else {
15131 if (arg->ch == 's')
15132 *p_str = PyObject_Str(v);
15133 else if (arg->ch == 'r')
15134 *p_str = PyObject_Repr(v);
15135 else
15136 *p_str = PyObject_ASCII(v);
15137 }
15138 break;
15139
15140 case 'i':
15141 case 'd':
15142 case 'u':
15143 case 'o':
15144 case 'x':
15145 case 'X':
15146 {
15147 int ret = mainformatlong(v, arg, p_str, writer);
15148 if (ret != 0)
15149 return ret;
15150 arg->sign = 1;
15151 break;
15152 }
15153
15154 case 'e':
15155 case 'E':
15156 case 'f':
15157 case 'F':
15158 case 'g':
15159 case 'G':
15160 if (arg->width == -1 && arg->prec == -1
15161 && !(arg->flags & (F_SIGN | F_BLANK)))
15162 {
15163 /* Fast path */
15164 if (formatfloat(v, arg, NULL, writer) == -1)
15165 return -1;
15166 return 1;
15167 }
15168
15169 arg->sign = 1;
15170 if (formatfloat(v, arg, p_str, NULL) == -1)
15171 return -1;
15172 break;
15173
15174 case 'c':
15175 {
15176 Py_UCS4 ch = formatchar(v);
15177 if (ch == (Py_UCS4) -1)
15178 return -1;
15179 if (arg->width == -1 && arg->prec == -1) {
15180 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015181 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015182 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015183 return 1;
15184 }
15185 *p_str = PyUnicode_FromOrdinal(ch);
15186 break;
15187 }
15188
15189 default:
15190 PyErr_Format(PyExc_ValueError,
15191 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015192 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015193 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15194 (int)arg->ch,
15195 ctx->fmtpos - 1);
15196 return -1;
15197 }
15198 if (*p_str == NULL)
15199 return -1;
15200 assert (PyUnicode_Check(*p_str));
15201 return 0;
15202}
15203
15204static int
15205unicode_format_arg_output(struct unicode_formatter_t *ctx,
15206 struct unicode_format_arg_t *arg,
15207 PyObject *str)
15208{
15209 Py_ssize_t len;
15210 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015211 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015212 Py_ssize_t pindex;
15213 Py_UCS4 signchar;
15214 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015215 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015216 Py_ssize_t sublen;
15217 _PyUnicodeWriter *writer = &ctx->writer;
15218 Py_UCS4 fill;
15219
15220 fill = ' ';
15221 if (arg->sign && arg->flags & F_ZERO)
15222 fill = '0';
15223
15224 if (PyUnicode_READY(str) == -1)
15225 return -1;
15226
15227 len = PyUnicode_GET_LENGTH(str);
15228 if ((arg->width == -1 || arg->width <= len)
15229 && (arg->prec == -1 || arg->prec >= len)
15230 && !(arg->flags & (F_SIGN | F_BLANK)))
15231 {
15232 /* Fast path */
15233 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15234 return -1;
15235 return 0;
15236 }
15237
15238 /* Truncate the string for "s", "r" and "a" formats
15239 if the precision is set */
15240 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15241 if (arg->prec >= 0 && len > arg->prec)
15242 len = arg->prec;
15243 }
15244
15245 /* Adjust sign and width */
15246 kind = PyUnicode_KIND(str);
15247 pbuf = PyUnicode_DATA(str);
15248 pindex = 0;
15249 signchar = '\0';
15250 if (arg->sign) {
15251 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15252 if (ch == '-' || ch == '+') {
15253 signchar = ch;
15254 len--;
15255 pindex++;
15256 }
15257 else if (arg->flags & F_SIGN)
15258 signchar = '+';
15259 else if (arg->flags & F_BLANK)
15260 signchar = ' ';
15261 else
15262 arg->sign = 0;
15263 }
15264 if (arg->width < len)
15265 arg->width = len;
15266
15267 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015268 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015269 if (!(arg->flags & F_LJUST)) {
15270 if (arg->sign) {
15271 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015272 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015273 }
15274 else {
15275 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015276 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015277 }
15278 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015279 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15280 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015281 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015282 }
15283
Victor Stinnera47082312012-10-04 02:19:54 +020015284 buflen = arg->width;
15285 if (arg->sign && len == arg->width)
15286 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015287 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015288 return -1;
15289
15290 /* Write the sign if needed */
15291 if (arg->sign) {
15292 if (fill != ' ') {
15293 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15294 writer->pos += 1;
15295 }
15296 if (arg->width > len)
15297 arg->width--;
15298 }
15299
15300 /* Write the numeric prefix for "x", "X" and "o" formats
15301 if the alternate form is used.
15302 For example, write "0x" for the "%#x" format. */
15303 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15304 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15305 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15306 if (fill != ' ') {
15307 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15308 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15309 writer->pos += 2;
15310 pindex += 2;
15311 }
15312 arg->width -= 2;
15313 if (arg->width < 0)
15314 arg->width = 0;
15315 len -= 2;
15316 }
15317
15318 /* Pad left with the fill character if needed */
15319 if (arg->width > len && !(arg->flags & F_LJUST)) {
15320 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015321 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015322 writer->pos += sublen;
15323 arg->width = len;
15324 }
15325
15326 /* If padding with spaces: write sign if needed and/or numeric prefix if
15327 the alternate form is used */
15328 if (fill == ' ') {
15329 if (arg->sign) {
15330 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15331 writer->pos += 1;
15332 }
15333 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15334 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15335 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15336 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15337 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15338 writer->pos += 2;
15339 pindex += 2;
15340 }
15341 }
15342
15343 /* Write characters */
15344 if (len) {
15345 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15346 str, pindex, len);
15347 writer->pos += len;
15348 }
15349
15350 /* Pad right with the fill character if needed */
15351 if (arg->width > len) {
15352 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015353 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015354 writer->pos += sublen;
15355 }
15356 return 0;
15357}
15358
15359/* Helper of PyUnicode_Format(): format one arg.
15360 Return 0 on success, raise an exception and return -1 on error. */
15361static int
15362unicode_format_arg(struct unicode_formatter_t *ctx)
15363{
15364 struct unicode_format_arg_t arg;
15365 PyObject *str;
15366 int ret;
15367
Victor Stinner8dbd4212012-12-04 09:30:24 +010015368 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015369 if (arg.ch == '%') {
15370 ctx->fmtpos++;
15371 ctx->fmtcnt--;
15372 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15373 return -1;
15374 return 0;
15375 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015376 arg.flags = 0;
15377 arg.width = -1;
15378 arg.prec = -1;
15379 arg.sign = 0;
15380 str = NULL;
15381
Victor Stinnera47082312012-10-04 02:19:54 +020015382 ret = unicode_format_arg_parse(ctx, &arg);
15383 if (ret == -1)
15384 return -1;
15385
15386 ret = unicode_format_arg_format(ctx, &arg, &str);
15387 if (ret == -1)
15388 return -1;
15389
15390 if (ret != 1) {
15391 ret = unicode_format_arg_output(ctx, &arg, str);
15392 Py_DECREF(str);
15393 if (ret == -1)
15394 return -1;
15395 }
15396
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015397 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015398 PyErr_SetString(PyExc_TypeError,
15399 "not all arguments converted during string formatting");
15400 return -1;
15401 }
15402 return 0;
15403}
15404
Alexander Belopolsky40018472011-02-26 01:02:56 +000015405PyObject *
15406PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015407{
Victor Stinnera47082312012-10-04 02:19:54 +020015408 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015409
Guido van Rossumd57fd912000-03-10 22:53:23 +000015410 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015411 PyErr_BadInternalCall();
15412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015413 }
Victor Stinnera47082312012-10-04 02:19:54 +020015414
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015415 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015416 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015417
15418 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015419 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15420 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15421 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15422 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015423
Victor Stinner8f674cc2013-04-17 23:02:17 +020015424 _PyUnicodeWriter_Init(&ctx.writer);
15425 ctx.writer.min_length = ctx.fmtcnt + 100;
15426 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015427
Guido van Rossumd57fd912000-03-10 22:53:23 +000015428 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015429 ctx.arglen = PyTuple_Size(args);
15430 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015431 }
15432 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015433 ctx.arglen = -1;
15434 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015435 }
Victor Stinnera47082312012-10-04 02:19:54 +020015436 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015437 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015438 ctx.dict = args;
15439 else
15440 ctx.dict = NULL;
15441 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015442
Victor Stinnera47082312012-10-04 02:19:54 +020015443 while (--ctx.fmtcnt >= 0) {
15444 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015445 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015446
15447 nonfmtpos = ctx.fmtpos++;
15448 while (ctx.fmtcnt >= 0 &&
15449 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15450 ctx.fmtpos++;
15451 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015452 }
Victor Stinnera47082312012-10-04 02:19:54 +020015453 if (ctx.fmtcnt < 0) {
15454 ctx.fmtpos--;
15455 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015456 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015457
Victor Stinnercfc4c132013-04-03 01:48:39 +020015458 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15459 nonfmtpos, ctx.fmtpos) < 0)
15460 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015461 }
15462 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015463 ctx.fmtpos++;
15464 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015465 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015466 }
15467 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015468
Victor Stinnera47082312012-10-04 02:19:54 +020015469 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015470 PyErr_SetString(PyExc_TypeError,
15471 "not all arguments converted during string formatting");
15472 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015473 }
15474
Victor Stinnera47082312012-10-04 02:19:54 +020015475 if (ctx.args_owned) {
15476 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015477 }
Victor Stinnera47082312012-10-04 02:19:54 +020015478 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015479
Benjamin Peterson29060642009-01-31 22:14:21 +000015480 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015481 _PyUnicodeWriter_Dealloc(&ctx.writer);
15482 if (ctx.args_owned) {
15483 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015484 }
15485 return NULL;
15486}
15487
Jeremy Hylton938ace62002-07-17 16:30:39 +000015488static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015489unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15490
15491/*[clinic input]
15492@classmethod
15493str.__new__ as unicode_new
15494
15495 object as x: object = NULL
15496 encoding: str = NULL
15497 errors: str = NULL
15498
15499[clinic start generated code]*/
Guido van Rossume023fe02001-08-30 03:12:59 +000015500
Tim Peters6d6c1a32001-08-02 04:15:00 +000015501static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015502unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15503 const char *errors)
15504/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
Tim Peters6d6c1a32001-08-02 04:15:00 +000015505{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015506 PyObject *unicode;
15507 if (x == NULL) {
15508 unicode = unicode_new_empty();
15509 }
15510 else if (encoding == NULL && errors == NULL) {
15511 unicode = PyObject_Str(x);
15512 }
15513 else {
15514 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15515 }
Tim Peters6d6c1a32001-08-02 04:15:00 +000015516
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015517 if (unicode != NULL && type != &PyUnicode_Type) {
15518 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15519 }
15520 return unicode;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015521}
15522
Guido van Rossume023fe02001-08-30 03:12:59 +000015523static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015524unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
Guido van Rossume023fe02001-08-30 03:12:59 +000015525{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015526 PyObject *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015527 Py_ssize_t length, char_size;
15528 int share_wstr, share_utf8;
15529 unsigned int kind;
15530 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015531
Benjamin Peterson14339b62009-01-31 16:36:08 +000015532 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner910337b2011-10-03 03:20:16 +020015533 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015534 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015535 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015536 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015537
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015538 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015539 if (self == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015540 return NULL;
15541 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015542 kind = PyUnicode_KIND(unicode);
15543 length = PyUnicode_GET_LENGTH(unicode);
15544
15545 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015546#ifdef Py_DEBUG
15547 _PyUnicode_HASH(self) = -1;
15548#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015549 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015550#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015551 _PyUnicode_STATE(self).interned = 0;
15552 _PyUnicode_STATE(self).kind = kind;
15553 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015554 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015555 _PyUnicode_STATE(self).ready = 1;
15556 _PyUnicode_WSTR(self) = NULL;
15557 _PyUnicode_UTF8_LENGTH(self) = 0;
15558 _PyUnicode_UTF8(self) = NULL;
15559 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015560 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015561
15562 share_utf8 = 0;
15563 share_wstr = 0;
15564 if (kind == PyUnicode_1BYTE_KIND) {
15565 char_size = 1;
15566 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15567 share_utf8 = 1;
15568 }
15569 else if (kind == PyUnicode_2BYTE_KIND) {
15570 char_size = 2;
15571 if (sizeof(wchar_t) == 2)
15572 share_wstr = 1;
15573 }
15574 else {
15575 assert(kind == PyUnicode_4BYTE_KIND);
15576 char_size = 4;
15577 if (sizeof(wchar_t) == 4)
15578 share_wstr = 1;
15579 }
15580
15581 /* Ensure we won't overflow the length. */
15582 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15583 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015584 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015585 }
Victor Stinner32bd68c2020-12-01 10:37:39 +010015586 data = PyObject_Malloc((length + 1) * char_size);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015587 if (data == NULL) {
15588 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015589 goto onError;
15590 }
15591
Victor Stinnerc3c74152011-10-02 20:39:55 +020015592 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015593 if (share_utf8) {
15594 _PyUnicode_UTF8_LENGTH(self) = length;
15595 _PyUnicode_UTF8(self) = data;
15596 }
15597 if (share_wstr) {
15598 _PyUnicode_WSTR_LENGTH(self) = length;
15599 _PyUnicode_WSTR(self) = (wchar_t *)data;
15600 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015601
Christian Heimesf051e432016-09-13 20:22:02 +020015602 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015603 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015604 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015605#ifdef Py_DEBUG
15606 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15607#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +010015608 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015609
15610onError:
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015611 Py_DECREF(self);
15612 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015613}
15614
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015615PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015616"str(object='') -> str\n\
15617str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015618\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015619Create a new string object from the given object. If encoding or\n\
15620errors is specified, then the object must expose a data buffer\n\
15621that will be decoded using the given encoding and error handler.\n\
15622Otherwise, returns the result of object.__str__() (if defined)\n\
15623or repr(object).\n\
15624encoding defaults to sys.getdefaultencoding().\n\
15625errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015626
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015627static PyObject *unicode_iter(PyObject *seq);
15628
Guido van Rossumd57fd912000-03-10 22:53:23 +000015629PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015630 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015631 "str", /* tp_name */
15632 sizeof(PyUnicodeObject), /* tp_basicsize */
15633 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015634 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015635 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015636 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015637 0, /* tp_getattr */
15638 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015639 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015640 unicode_repr, /* tp_repr */
15641 &unicode_as_number, /* tp_as_number */
15642 &unicode_as_sequence, /* tp_as_sequence */
15643 &unicode_as_mapping, /* tp_as_mapping */
15644 (hashfunc) unicode_hash, /* tp_hash*/
15645 0, /* tp_call*/
15646 (reprfunc) unicode_str, /* tp_str */
15647 PyObject_GenericGetAttr, /* tp_getattro */
15648 0, /* tp_setattro */
15649 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015650 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Brandt Bucher145bf262021-02-26 14:51:55 -080015651 Py_TPFLAGS_UNICODE_SUBCLASS |
15652 _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
Bupfc93bd42018-06-19 03:59:55 -050015653 unicode_doc, /* tp_doc */
15654 0, /* tp_traverse */
15655 0, /* tp_clear */
15656 PyUnicode_RichCompare, /* tp_richcompare */
15657 0, /* tp_weaklistoffset */
15658 unicode_iter, /* tp_iter */
15659 0, /* tp_iternext */
15660 unicode_methods, /* tp_methods */
15661 0, /* tp_members */
15662 0, /* tp_getset */
15663 &PyBaseObject_Type, /* tp_base */
15664 0, /* tp_dict */
15665 0, /* tp_descr_get */
15666 0, /* tp_descr_set */
15667 0, /* tp_dictoffset */
15668 0, /* tp_init */
15669 0, /* tp_alloc */
15670 unicode_new, /* tp_new */
15671 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015672};
15673
15674/* Initialize the Unicode implementation */
15675
Victor Stinner331a6a52019-05-27 16:39:22 +020015676PyStatus
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015677_PyUnicode_Init(PyInterpreterState *interp)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015678{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015679 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015680 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015681 0x000A, /* LINE FEED */
15682 0x000D, /* CARRIAGE RETURN */
15683 0x001C, /* FILE SEPARATOR */
15684 0x001D, /* GROUP SEPARATOR */
15685 0x001E, /* RECORD SEPARATOR */
15686 0x0085, /* NEXT LINE */
15687 0x2028, /* LINE SEPARATOR */
15688 0x2029, /* PARAGRAPH SEPARATOR */
15689 };
15690
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015691 struct _Py_unicode_state *state = &interp->unicode;
Victor Stinner91698d82020-06-25 14:07:40 +020015692 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015693 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015694 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015695
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015696 if (_Py_IsMainInterpreter(interp)) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015697 /* initialize the linebreak bloom filter */
15698 bloom_linebreak = make_bloom_mask(
15699 PyUnicode_2BYTE_KIND, linebreak,
15700 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015701
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015702 if (PyType_Ready(&PyUnicode_Type) < 0) {
15703 return _PyStatus_ERR("Can't initialize unicode type");
15704 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015705
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015706 if (PyType_Ready(&EncodingMapType) < 0) {
15707 return _PyStatus_ERR("Can't initialize encoding map type");
15708 }
15709 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15710 return _PyStatus_ERR("Can't initialize field name iterator type");
15711 }
15712 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15713 return _PyStatus_ERR("Can't initialize formatter iter type");
15714 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015715 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015716 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015717}
15718
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015719
Walter Dörwald16807132007-05-25 13:52:07 +000015720void
15721PyUnicode_InternInPlace(PyObject **p)
15722{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015723 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015724#ifdef Py_DEBUG
15725 assert(s != NULL);
15726 assert(_PyUnicode_CHECK(s));
15727#else
Victor Stinner607b1022020-05-05 18:50:30 +020015728 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015729 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015730 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015731#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015732
Benjamin Peterson14339b62009-01-31 16:36:08 +000015733 /* If it's a subclass, we don't really know what putting
15734 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015735 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015736 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015737 }
15738
15739 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015740 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015741 }
15742
Victor Stinner666ecfb2020-07-02 01:19:57 +020015743 if (PyUnicode_READY(s) == -1) {
15744 PyErr_Clear();
15745 return;
15746 }
15747
Victor Stinnerea251802020-12-26 02:58:33 +010015748 struct _Py_unicode_state *state = get_unicode_state();
15749 if (state->interned == NULL) {
15750 state->interned = PyDict_New();
15751 if (state->interned == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015752 PyErr_Clear(); /* Don't leave an exception */
15753 return;
15754 }
15755 }
Victor Stinner607b1022020-05-05 18:50:30 +020015756
Victor Stinnerea251802020-12-26 02:58:33 +010015757 PyObject *t = PyDict_SetDefault(state->interned, s, s);
Berker Peksagced8d4c2016-07-25 04:40:39 +030015758 if (t == NULL) {
15759 PyErr_Clear();
15760 return;
15761 }
Victor Stinner607b1022020-05-05 18:50:30 +020015762
Berker Peksagced8d4c2016-07-25 04:40:39 +030015763 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015764 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015765 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015766 return;
15767 }
Victor Stinner607b1022020-05-05 18:50:30 +020015768
Victor Stinner3549ca32020-07-03 16:59:12 +020015769 /* The two references in interned dict (key and value) are not counted by
15770 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15771 this. */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015772 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015773 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015774}
15775
Victor Stinnerea251802020-12-26 02:58:33 +010015776
Walter Dörwald16807132007-05-25 13:52:07 +000015777void
15778PyUnicode_InternImmortal(PyObject **p)
15779{
Victor Stinner583ee5a2020-10-02 14:49:00 +020015780 if (PyErr_WarnEx(PyExc_DeprecationWarning,
15781 "PyUnicode_InternImmortal() is deprecated; "
15782 "use PyUnicode_InternInPlace() instead", 1) < 0)
15783 {
15784 // The function has no return value, the exception cannot
15785 // be reported to the caller, so just log it.
15786 PyErr_WriteUnraisable(NULL);
15787 }
15788
Benjamin Peterson14339b62009-01-31 16:36:08 +000015789 PyUnicode_InternInPlace(p);
15790 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015791 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015792 Py_INCREF(*p);
15793 }
Walter Dörwald16807132007-05-25 13:52:07 +000015794}
15795
15796PyObject *
15797PyUnicode_InternFromString(const char *cp)
15798{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015799 PyObject *s = PyUnicode_FromString(cp);
15800 if (s == NULL)
15801 return NULL;
15802 PyUnicode_InternInPlace(&s);
15803 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015804}
15805
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015806
Victor Stinner666ecfb2020-07-02 01:19:57 +020015807void
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015808_PyUnicode_ClearInterned(PyInterpreterState *interp)
Walter Dörwald16807132007-05-25 13:52:07 +000015809{
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015810 struct _Py_unicode_state *state = &interp->unicode;
Victor Stinnerea251802020-12-26 02:58:33 +010015811 if (state->interned == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015812 return;
15813 }
Victor Stinnerea251802020-12-26 02:58:33 +010015814 assert(PyDict_CheckExact(state->interned));
Victor Stinner666ecfb2020-07-02 01:19:57 +020015815
15816 /* Interned unicode strings are not forcibly deallocated; rather, we give
15817 them their stolen references back, and then clear and DECREF the
15818 interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015819
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015820#ifdef INTERNED_STATS
Victor Stinnerea251802020-12-26 02:58:33 +010015821 fprintf(stderr, "releasing %zd interned strings\n",
15822 PyDict_GET_SIZE(state->interned));
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015823
15824 Py_ssize_t immortal_size = 0, mortal_size = 0;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015825#endif
Victor Stinnerea251802020-12-26 02:58:33 +010015826 Py_ssize_t pos = 0;
15827 PyObject *s, *ignored_value;
15828 while (PyDict_Next(state->interned, &pos, &s, &ignored_value)) {
Victor Stinner666ecfb2020-07-02 01:19:57 +020015829 assert(PyUnicode_IS_READY(s));
15830
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015831 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015832 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015833 Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015834#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015835 immortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015836#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015837 break;
15838 case SSTATE_INTERNED_MORTAL:
Victor Stinner3549ca32020-07-03 16:59:12 +020015839 // Restore the two references (key and value) ignored
15840 // by PyUnicode_InternInPlace().
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015841 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015842#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015843 mortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015844#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015845 break;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015846 case SSTATE_NOT_INTERNED:
15847 /* fall through */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015848 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015849 Py_UNREACHABLE();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015851 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015852 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015853#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015854 fprintf(stderr,
15855 "total size of all interned strings: %zd/%zd mortal/immortal\n",
15856 mortal_size, immortal_size);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015857#endif
Victor Stinner666ecfb2020-07-02 01:19:57 +020015858
Victor Stinnerea251802020-12-26 02:58:33 +010015859 PyDict_Clear(state->interned);
15860 Py_CLEAR(state->interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015861}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015862
15863
15864/********************* Unicode Iterator **************************/
15865
/* State of a str iterator ("str_iterator" objects). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to yield */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15871
15872static void
15873unicodeiter_dealloc(unicodeiterobject *it)
15874{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015875 _PyObject_GC_UNTRACK(it);
15876 Py_XDECREF(it->it_seq);
15877 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015878}
15879
/* tp_traverse for str iterators: the only object reference held is
   it_seq.  Always returns 0 unless the visit callback reports otherwise
   through Py_VISIT. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
15886
15887static PyObject *
15888unicodeiter_next(unicodeiterobject *it)
15889{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015890 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015891
Benjamin Peterson14339b62009-01-31 16:36:08 +000015892 assert(it != NULL);
15893 seq = it->it_seq;
15894 if (seq == NULL)
15895 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015896 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015897
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015898 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15899 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015900 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015901 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15902 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015903 if (item != NULL)
15904 ++it->it_index;
15905 return item;
15906 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015907
Benjamin Peterson14339b62009-01-31 16:36:08 +000015908 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015909 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015910 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015911}
15912
15913static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015914unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015915{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015916 Py_ssize_t len = 0;
15917 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015918 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015919 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015920}
15921
15922PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15923
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015924static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015925unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015926{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015927 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015928 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015929 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015930 it->it_seq, it->it_index);
15931 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015932 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015933 if (u == NULL)
15934 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015935 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015936 }
15937}
15938
15939PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15940
15941static PyObject *
15942unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15943{
15944 Py_ssize_t index = PyLong_AsSsize_t(state);
15945 if (index == -1 && PyErr_Occurred())
15946 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015947 if (it->it_seq != NULL) {
15948 if (index < 0)
15949 index = 0;
15950 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15951 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15952 it->it_index = index;
15953 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015954 Py_RETURN_NONE;
15955}
15956
15957PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15958
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015959static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015960 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015961 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015962 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15963 reduce_doc},
15964 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15965 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015966 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015967};
15968
15969PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015970 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15971 "str_iterator", /* tp_name */
15972 sizeof(unicodeiterobject), /* tp_basicsize */
15973 0, /* tp_itemsize */
15974 /* methods */
15975 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015976 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015977 0, /* tp_getattr */
15978 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015979 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015980 0, /* tp_repr */
15981 0, /* tp_as_number */
15982 0, /* tp_as_sequence */
15983 0, /* tp_as_mapping */
15984 0, /* tp_hash */
15985 0, /* tp_call */
15986 0, /* tp_str */
15987 PyObject_GenericGetAttr, /* tp_getattro */
15988 0, /* tp_setattro */
15989 0, /* tp_as_buffer */
15990 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15991 0, /* tp_doc */
15992 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15993 0, /* tp_clear */
15994 0, /* tp_richcompare */
15995 0, /* tp_weaklistoffset */
15996 PyObject_SelfIter, /* tp_iter */
15997 (iternextfunc)unicodeiter_next, /* tp_iternext */
15998 unicodeiter_methods, /* tp_methods */
15999 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016000};
16001
16002static PyObject *
16003unicode_iter(PyObject *seq)
16004{
Benjamin Peterson14339b62009-01-31 16:36:08 +000016005 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016006
Benjamin Peterson14339b62009-01-31 16:36:08 +000016007 if (!PyUnicode_Check(seq)) {
16008 PyErr_BadInternalCall();
16009 return NULL;
16010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020016011 if (PyUnicode_READY(seq) == -1)
16012 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016013 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
16014 if (it == NULL)
16015 return NULL;
16016 it->it_index = 0;
16017 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020016018 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016019 _PyObject_GC_TRACK(it);
16020 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016021}
16022
Victor Stinner709d23d2019-05-02 14:56:30 -040016023static int
16024encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016025{
Victor Stinner709d23d2019-05-02 14:56:30 -040016026 int res;
16027 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16028 if (res == -2) {
16029 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16030 return -1;
16031 }
16032 if (res < 0) {
16033 PyErr_NoMemory();
16034 return -1;
16035 }
16036 return 0;
16037}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016038
Victor Stinner709d23d2019-05-02 14:56:30 -040016039
16040static int
16041config_get_codec_name(wchar_t **config_encoding)
16042{
16043 char *encoding;
16044 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16045 return -1;
16046 }
16047
16048 PyObject *name_obj = NULL;
16049 PyObject *codec = _PyCodec_Lookup(encoding);
16050 PyMem_RawFree(encoding);
16051
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016052 if (!codec)
16053 goto error;
16054
16055 name_obj = PyObject_GetAttrString(codec, "name");
16056 Py_CLEAR(codec);
16057 if (!name_obj) {
16058 goto error;
16059 }
16060
Victor Stinner709d23d2019-05-02 14:56:30 -040016061 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16062 Py_DECREF(name_obj);
16063 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016064 goto error;
16065 }
16066
Victor Stinner709d23d2019-05-02 14:56:30 -040016067 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16068 if (raw_wname == NULL) {
16069 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016070 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016071 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016072 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016073
16074 PyMem_RawFree(*config_encoding);
16075 *config_encoding = raw_wname;
16076
16077 PyMem_Free(wname);
16078 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016079
16080error:
16081 Py_XDECREF(codec);
16082 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016083 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016084}
16085
16086
Victor Stinner331a6a52019-05-27 16:39:22 +020016087static PyStatus
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016088init_stdio_encoding(PyInterpreterState *interp)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016089{
Victor Stinner709d23d2019-05-02 14:56:30 -040016090 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016091 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016092 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016093 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016094 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016095 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016096 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016097}
16098
16099
Victor Stinner709d23d2019-05-02 14:56:30 -040016100static int
16101init_fs_codec(PyInterpreterState *interp)
16102{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016103 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016104
16105 _Py_error_handler error_handler;
16106 error_handler = get_error_handler_wide(config->filesystem_errors);
16107 if (error_handler == _Py_ERROR_UNKNOWN) {
16108 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16109 return -1;
16110 }
16111
16112 char *encoding, *errors;
16113 if (encode_wstr_utf8(config->filesystem_encoding,
16114 &encoding,
16115 "filesystem_encoding") < 0) {
16116 return -1;
16117 }
16118
16119 if (encode_wstr_utf8(config->filesystem_errors,
16120 &errors,
16121 "filesystem_errors") < 0) {
16122 PyMem_RawFree(encoding);
16123 return -1;
16124 }
16125
Victor Stinner3d17c042020-05-14 01:48:38 +020016126 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16127 PyMem_RawFree(fs_codec->encoding);
16128 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016129 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016130 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16131 PyMem_RawFree(fs_codec->errors);
16132 fs_codec->errors = errors;
16133 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016134
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016135#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016136 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016137#endif
16138
Victor Stinner709d23d2019-05-02 14:56:30 -040016139 /* At this point, PyUnicode_EncodeFSDefault() and
16140 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16141 the C implementation of the filesystem encoding. */
16142
16143 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16144 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016145 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16146 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016147 PyErr_NoMemory();
16148 return -1;
16149 }
16150 return 0;
16151}
16152
16153
Victor Stinner331a6a52019-05-27 16:39:22 +020016154static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016155init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016156{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016157 PyInterpreterState *interp = tstate->interp;
16158
Victor Stinner709d23d2019-05-02 14:56:30 -040016159 /* Update the filesystem encoding to the normalized Python codec name.
16160 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16161 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016162 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016163 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016164 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016165 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016166 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016167 }
16168
Victor Stinner709d23d2019-05-02 14:56:30 -040016169 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016170 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016171 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016172 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016173}
16174
16175
Victor Stinner331a6a52019-05-27 16:39:22 +020016176PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016177_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016178{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016179 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016180 if (_PyStatus_EXCEPTION(status)) {
16181 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016182 }
16183
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016184 return init_stdio_encoding(tstate->interp);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016185}
16186
16187
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016188static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016189_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016190{
Victor Stinner3d17c042020-05-14 01:48:38 +020016191 PyMem_RawFree(fs_codec->encoding);
16192 fs_codec->encoding = NULL;
16193 fs_codec->utf8 = 0;
16194 PyMem_RawFree(fs_codec->errors);
16195 fs_codec->errors = NULL;
16196 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016197}
16198
16199
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to mbcs/replace
   (PEP 529) and re-initialize the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set if the
   new config strings cannot be allocated (the config is then left
   unchanged). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacements before touching the config. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_encoding = new_encoding;
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16225
16226
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016227void
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016228_PyUnicode_Fini(PyInterpreterState *interp)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016229{
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016230 struct _Py_unicode_state *state = &interp->unicode;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016231
Victor Stinnerea251802020-12-26 02:58:33 +010016232 // _PyUnicode_ClearInterned() must be called before
16233 assert(state->interned == NULL);
16234
16235 _PyUnicode_FiniEncodings(&state->fs_codec);
16236
Victor Stinnerf4507232020-12-26 20:26:08 +010016237 unicode_clear_identifiers(state);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016238
Victor Stinner2f9ada92020-06-24 02:22:21 +020016239 for (Py_ssize_t i = 0; i < 256; i++) {
16240 Py_CLEAR(state->latin1[i]);
16241 }
Victor Stinnerea251802020-12-26 02:58:33 +010016242 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016243}
16244
16245
Georg Brandl66c221e2010-10-14 07:04:07 +000016246/* A _string module, to export formatter_parser and formatter_field_name_split
16247 to the string.Formatter class implemented in Python. */
16248
16249static PyMethodDef _string_methods[] = {
16250 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16251 METH_O, PyDoc_STR("split the argument as a field name")},
16252 {"formatter_parser", (PyCFunction) formatter_parser,
16253 METH_O, PyDoc_STR("parse the argument as a format string")},
16254 {NULL, NULL}
16255};
16256
/* Module definition of "_string": no per-module state (m_size is 0), only
   the functions listed in _string_methods. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    .m_name = "_string",
    .m_doc = PyDoc_STR("string helper module"),
    .m_size = 0,
    .m_methods = _string_methods,
};
16264
/* Init function of the _string module: returns the result of
   PyModuleDef_Init() on the module definition above. */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModuleDef_Init(&_string_module);
}
16270
16271
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016272#ifdef __cplusplus
16273}
16274#endif