/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner47e1afd2020-10-26 16:43:47 +010043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinnerba3d67c2020-12-26 00:41:46 +010044#include "pycore_atomic_funcs.h" // _Py_atomic_size_get()
Victor Stinner47e1afd2020-10-26 16:43:47 +010045#include "pycore_bytes_methods.h" // _Py_bytes_lower()
Serhiy Storchaka2ad93822020-12-03 12:46:16 +020046#include "pycore_format.h" // F_LJUST
Victor Stinner47e1afd2020-10-26 16:43:47 +010047#include "pycore_initconfig.h" // _PyStatus_OK()
48#include "pycore_interp.h" // PyInterpreterState.fs_codec
49#include "pycore_object.h" // _PyObject_GC_TRACK()
50#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
51#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
52#include "pycore_pystate.h" // _PyInterpreterState_GET()
53#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
54#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000056#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000057#include <windows.h>
58#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059
/* Uncomment to display statistics on interned strings at exit
   in _PyUnicode_ClearInterned(). */
/* #define INTERNED_STATS 1 */
63
64
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080083
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
/* Highest Unicode code point (as of Unicode 6.0): U+10FFFF, i.e. 1,114,111. */
#define MAX_UNICODE 0x10FFFF
99
/* Type check used by the accessor macros of this file.  Builds with Py_DEBUG
   run _PyUnicode_CheckConsistency() with check_content=0; all other builds
   only call PyUnicode_Check(). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200105
/* Unchecked access to the `utf8` field, viewing the object as a
   PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 pointer of a ready string.  For a compact ASCII string this is the
   address immediately following the PyASCIIObject header; for any other
   string it is the `utf8` field. */
#define PyUnicode_UTF8(op)                          \
    (assert(_PyUnicode_CHECK(op)),                  \
     assert(PyUnicode_IS_READY(op)),                \
     PyUnicode_IS_COMPACT_ASCII(op) ?               \
         ((char*)((PyASCIIObject*)(op) + 1)) :      \
         _PyUnicode_UTF8(op))

/* Unchecked access to the `utf8_length` field, viewing the object as a
   PyCompactUnicodeObject. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: `length` for a compact ASCII string,
   the `utf8_length` field otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                   \
    (assert(_PyUnicode_CHECK(op)),                  \
     assert(PyUnicode_IS_READY(op)),                \
     PyUnicode_IS_COMPACT_ASCII(op) ?               \
         ((PyASCIIObject*)(op))->length :           \
         _PyUnicode_UTF8_LENGTH(op))

/* Unchecked access to the `wstr` field of the PyASCIIObject header. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900124
/* Don't use the deprecated macro of unicodeobject.h: replace it with a local
   definition.  It yields `length` for a compact ASCII string and
   `wstr_length` otherwise.

   The argument is parenthesized before each cast so that an arbitrary
   expression (not only a plain identifier) is cast as a whole; a cast binds
   tighter than most operators, so `(T*)op` would mis-parse e.g. `p + 1`. */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Direct, unchecked field accessors. */
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)

/* Field accessors that first assert _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                     \
    (assert(_PyUnicode_CHECK(op)),              \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)               \
    (assert(_PyUnicode_CHECK(op)),              \
     ((PyASCIIObject *)(op))->length)

/* Unchecked access to `data.any`, viewing the object as a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200147
/* Local replacement for PyUnicode_READY(): evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200154
/* True if the `utf8` pointer is the same address as PyUnicode_DATA(op).
   Must not be used on a compact ASCII string (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the `wstr` pointer is the same address as PyUnicode_DATA(op).

   Both sides of the comparison use the macro parameter `op`; the macro does
   not depend on any identifier of the calling scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
162
/* True if the Unicode object has an allocated UTF-8 memory block, i.e. the
   object is not compact ASCII and its `utf8` pointer is non-NULL and differs
   from PyUnicode_DATA(op) (not shared with other data). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))
169
/* True if the Unicode object has an allocated wstr memory block, i.e. its
   `wstr` pointer is non-NULL and either the string is not ready or `wstr`
   differs from PyUnicode_DATA(op) (not shared with other data). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op)                                \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
176
/* Generic helper macro converting a run of characters from one character
   type to another.

   from_type, to_type: valid type names.
   begin, end:         pointers delimiting the source characters
                       ("from_type *"); `end` is exclusive.
   to:                 pointer ("to_type *") to the buffer receiving the
                       converted characters.

   Every argument is evaluated exactly once, before any copying starts.
   Characters are copied four at a time, then the remaining ones (fewer than
   four) are copied one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t n = (_end) - (_iter);                                \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                          \
        /* unrolled part: groups of four characters */                  \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) {         \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        /* tail: whatever is left after the last full group */          \
        for (; _iter < (_end); ++_iter, ++_to) {                        \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200200
/* Platform-dependent over-allocation factor.  Per the notes below, a factor
   of 2 corresponds to 50% over-allocation and a factor of 4 to 25%. */
#if defined(MS_WINDOWS)
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
208
Walter Dörwald16807132007-05-25 13:52:07 +0000209
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200210static struct _Py_unicode_state*
211get_unicode_state(void)
212{
213 PyInterpreterState *interp = _PyInterpreterState_GET();
214 return &interp->unicode;
215}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200216
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000217
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200218// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200219static inline PyObject* unicode_get_empty(void)
220{
221 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200222 // unicode_get_empty() must not be called before _PyUnicode_Init()
223 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200224 assert(state->empty_string != NULL);
225 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200226}
227
Victor Stinner91698d82020-06-25 14:07:40 +0200228
229// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200230static inline PyObject* unicode_new_empty(void)
231{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200232 PyObject *empty = unicode_get_empty();
233 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200234 return empty;
235}
236
/* Statement macro: return a new reference to the empty string singleton
   from the enclosing function. */
#define _Py_RETURN_UNICODE_EMPTY() \
    do { return unicode_new_empty(); } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000241
Victor Stinner59423e32018-11-26 13:40:01 +0100242static inline void
243unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
244 Py_ssize_t start, Py_ssize_t length)
245{
246 assert(0 <= start);
247 assert(kind != PyUnicode_WCHAR_KIND);
248 switch (kind) {
249 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100250 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100251 Py_UCS1 ch = (unsigned char)value;
252 Py_UCS1 *to = (Py_UCS1 *)data + start;
253 memset(to, ch, length);
254 break;
255 }
256 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100257 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100258 Py_UCS2 ch = (Py_UCS2)value;
259 Py_UCS2 *to = (Py_UCS2 *)data + start;
260 const Py_UCS2 *end = to + length;
261 for (; to < end; ++to) *to = ch;
262 break;
263 }
264 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100265 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100266 Py_UCS4 ch = value;
267 Py_UCS4 * to = (Py_UCS4 *)data + start;
268 const Py_UCS4 *end = to + length;
269 for (; to < end; ++to) *to = ch;
270 break;
271 }
272 default: Py_UNREACHABLE();
273 }
274}
275
276
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200277/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700278static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200279_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900280static inline void
281_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400282static PyObject *
283unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
284 const char *errors);
285static PyObject *
286unicode_decode_utf8(const char *s, Py_ssize_t size,
287 _Py_error_handler error_handler, const char *errors,
288 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200289
/* Fast detection of the most frequent whitespace characters.
   The table has one entry per ASCII code (128 entries); an entry is 1 when
   the code is one of the whitespace characters named below, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x0009: CHARACTER TABULATION
     *   0x000A: LINE FEED
     *   0x000B: LINE TABULATION
     *   0x000C: FORM FEED
     *   0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR
     *   0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
     *   0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
320
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200321/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200322static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200323static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100324static int unicode_modifiable(PyObject *unicode);
325
Victor Stinnerfe226c02011-10-03 03:52:20 +0200326
Alexander Belopolsky40018472011-02-26 01:02:56 +0000327static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100328_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200329static PyObject *
330_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
331static PyObject *
332_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
333
334static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000335unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000336 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100337 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000338 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
339
Alexander Belopolsky40018472011-02-26 01:02:56 +0000340static void
341raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300342 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100343 PyObject *unicode,
344 Py_ssize_t startpos, Py_ssize_t endpos,
345 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000346
/* Same kind of table for line breaks: one entry per ASCII code (128
   entries); an entry is 1 when the code is one of the line break characters
   named below, 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
     *   0x000A: LINE FEED
     *   0x000B: LINE TABULATION
     *   0x000C: FORM FEED
     *   0x000D: CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
     *   0x001C: FILE SEPARATOR
     *   0x001D: GROUP SEPARATOR
     *   0x001E: RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
374
INADA Naoki3ae20562017-01-16 20:41:20 +0900375static int convert_uc(PyObject *obj, void *addr);
376
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300377#include "clinic/unicodeobject.c.h"
378
Victor Stinner3d4226a2018-08-29 22:21:32 +0200379_Py_error_handler
380_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200381{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200382 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200383 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200384 }
385 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200386 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200387 }
388 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200389 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200390 }
391 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200392 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200393 }
394 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200395 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200396 }
397 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200398 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 }
400 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200401 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200402 }
Victor Stinner50149202015-09-22 00:26:54 +0200403 return _Py_ERROR_OTHER;
404}
405
Victor Stinner709d23d2019-05-02 14:56:30 -0400406
407static _Py_error_handler
408get_error_handler_wide(const wchar_t *errors)
409{
410 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
411 return _Py_ERROR_STRICT;
412 }
413 if (wcscmp(errors, L"surrogateescape") == 0) {
414 return _Py_ERROR_SURROGATEESCAPE;
415 }
416 if (wcscmp(errors, L"replace") == 0) {
417 return _Py_ERROR_REPLACE;
418 }
419 if (wcscmp(errors, L"ignore") == 0) {
420 return _Py_ERROR_IGNORE;
421 }
422 if (wcscmp(errors, L"backslashreplace") == 0) {
423 return _Py_ERROR_BACKSLASHREPLACE;
424 }
425 if (wcscmp(errors, L"surrogatepass") == 0) {
426 return _Py_ERROR_SURROGATEPASS;
427 }
428 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
429 return _Py_ERROR_XMLCHARREFREPLACE;
430 }
431 return _Py_ERROR_OTHER;
432}
433
434
Victor Stinner22eb6892019-06-26 00:51:05 +0200435static inline int
436unicode_check_encoding_errors(const char *encoding, const char *errors)
437{
438 if (encoding == NULL && errors == NULL) {
439 return 0;
440 }
441
Victor Stinner81a7be32020-04-14 15:14:01 +0200442 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200443#ifndef Py_DEBUG
444 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200445 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200446 return 0;
447 }
448#else
449 /* Always check in debug mode */
450#endif
451
452 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
453 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200454 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200455 return 0;
456 }
457
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200458 /* Disable checks during Python finalization. For example, it allows to
459 call _PyObject_Dump() during finalization for debugging purpose. */
460 if (interp->finalizing) {
461 return 0;
462 }
463
Victor Stinner22eb6892019-06-26 00:51:05 +0200464 if (encoding != NULL) {
465 PyObject *handler = _PyCodec_Lookup(encoding);
466 if (handler == NULL) {
467 return -1;
468 }
469 Py_DECREF(handler);
470 }
471
472 if (errors != NULL) {
473 PyObject *handler = PyCodec_LookupError(errors);
474 if (handler == NULL) {
475 return -1;
476 }
477 Py_DECREF(handler);
478 }
479 return 0;
480}
481
482
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200483int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100484_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200485{
Victor Stinner68762572019-10-07 18:42:01 +0200486#define CHECK(expr) \
487 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
488
Victor Stinner910337b2011-10-03 03:20:16 +0200489 PyASCIIObject *ascii;
490 unsigned int kind;
491
Victor Stinner68762572019-10-07 18:42:01 +0200492 assert(op != NULL);
493 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200494
495 ascii = (PyASCIIObject *)op;
496 kind = ascii->state.kind;
497
Victor Stinnera3b334d2011-10-03 13:53:37 +0200498 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200499 CHECK(kind == PyUnicode_1BYTE_KIND);
500 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200501 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200502 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200503 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200504 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200505
Victor Stinnera41463c2011-10-04 01:05:08 +0200506 if (ascii->state.compact == 1) {
507 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200508 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200509 || kind == PyUnicode_2BYTE_KIND
510 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200511 CHECK(ascii->state.ascii == 0);
512 CHECK(ascii->state.ready == 1);
513 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100514 }
515 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200516 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
517
518 data = unicode->data.any;
519 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200520 CHECK(ascii->length == 0);
521 CHECK(ascii->hash == -1);
522 CHECK(ascii->state.compact == 0);
523 CHECK(ascii->state.ascii == 0);
524 CHECK(ascii->state.ready == 0);
525 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
526 CHECK(ascii->wstr != NULL);
527 CHECK(data == NULL);
528 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200529 }
530 else {
Victor Stinner68762572019-10-07 18:42:01 +0200531 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200532 || kind == PyUnicode_2BYTE_KIND
533 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200534 CHECK(ascii->state.compact == 0);
535 CHECK(ascii->state.ready == 1);
536 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200537 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200538 CHECK(compact->utf8 == data);
539 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200540 }
541 else
Victor Stinner68762572019-10-07 18:42:01 +0200542 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200543 }
544 }
545 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200546 if (
547#if SIZEOF_WCHAR_T == 2
548 kind == PyUnicode_2BYTE_KIND
549#else
550 kind == PyUnicode_4BYTE_KIND
551#endif
552 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200553 {
Victor Stinner68762572019-10-07 18:42:01 +0200554 CHECK(ascii->wstr == data);
555 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200556 } else
Victor Stinner68762572019-10-07 18:42:01 +0200557 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200558 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200559
560 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200561 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200562 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200563 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200564 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200565
566 /* check that the best kind is used: O(n) operation */
567 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200568 Py_ssize_t i;
569 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300570 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200571 Py_UCS4 ch;
572
573 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200574 for (i=0; i < ascii->length; i++)
575 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200576 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200577 if (ch > maxchar)
578 maxchar = ch;
579 }
580 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100581 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200582 CHECK(maxchar >= 128);
583 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100584 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200585 else
Victor Stinner68762572019-10-07 18:42:01 +0200586 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200587 }
Victor Stinner77faf692011-11-20 18:56:05 +0100588 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200589 CHECK(maxchar >= 0x100);
590 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100591 }
592 else {
Victor Stinner68762572019-10-07 18:42:01 +0200593 CHECK(maxchar >= 0x10000);
594 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100595 }
Victor Stinner68762572019-10-07 18:42:01 +0200596 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200597 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400598 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200599
600#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400601}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200602
Victor Stinner910337b2011-10-03 03:20:16 +0200603
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100604static PyObject*
605unicode_result_wchar(PyObject *unicode)
606{
607#ifndef Py_DEBUG
608 Py_ssize_t len;
609
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100610 len = _PyUnicode_WSTR_LENGTH(unicode);
611 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100612 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200613 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100614 }
615
616 if (len == 1) {
617 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100618 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100619 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200620 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100621 }
622 }
623
624 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200625 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100626 return NULL;
627 }
628#else
Victor Stinneraa771272012-10-04 02:32:58 +0200629 assert(Py_REFCNT(unicode) == 1);
630
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 /* don't make the result ready in debug mode to ensure that the caller
632 makes the string ready before using it */
633 assert(_PyUnicode_CheckConsistency(unicode, 1));
634#endif
635 return unicode;
636}
637
638static PyObject*
639unicode_result_ready(PyObject *unicode)
640{
641 Py_ssize_t length;
642
643 length = PyUnicode_GET_LENGTH(unicode);
644 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200645 PyObject *empty = unicode_get_empty();
646 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100647 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200648 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100649 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200650 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100651 }
652
653 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200654 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200655 if (kind == PyUnicode_1BYTE_KIND) {
656 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
657 Py_UCS1 ch = data[0];
658 struct _Py_unicode_state *state = get_unicode_state();
659 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100660 if (latin1_char != NULL) {
661 if (unicode != latin1_char) {
662 Py_INCREF(latin1_char);
663 Py_DECREF(unicode);
664 }
665 return latin1_char;
666 }
667 else {
668 assert(_PyUnicode_CheckConsistency(unicode, 1));
669 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200670 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100671 return unicode;
672 }
673 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200674 else {
675 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
676 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100677 }
678
679 assert(_PyUnicode_CheckConsistency(unicode, 1));
680 return unicode;
681}
682
683static PyObject*
684unicode_result(PyObject *unicode)
685{
686 assert(_PyUnicode_CHECK(unicode));
687 if (PyUnicode_IS_READY(unicode))
688 return unicode_result_ready(unicode);
689 else
690 return unicode_result_wchar(unicode);
691}
692
Victor Stinnerc4b49542011-12-11 22:44:26 +0100693static PyObject*
694unicode_result_unchanged(PyObject *unicode)
695{
696 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500697 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100698 return NULL;
699 Py_INCREF(unicode);
700 return unicode;
701 }
702 else
703 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100704 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100705}
706
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200707/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
708 ASCII, Latin1, UTF-8, etc. */
709static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200710backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200711 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
712{
Victor Stinnerad771582015-10-09 12:38:53 +0200713 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200714 Py_UCS4 ch;
715 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300716 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200717
718 assert(PyUnicode_IS_READY(unicode));
719 kind = PyUnicode_KIND(unicode);
720 data = PyUnicode_DATA(unicode);
721
722 size = 0;
723 /* determine replacement size */
724 for (i = collstart; i < collend; ++i) {
725 Py_ssize_t incr;
726
727 ch = PyUnicode_READ(kind, data, i);
728 if (ch < 0x100)
729 incr = 2+2;
730 else if (ch < 0x10000)
731 incr = 2+4;
732 else {
733 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200734 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200735 }
736 if (size > PY_SSIZE_T_MAX - incr) {
737 PyErr_SetString(PyExc_OverflowError,
738 "encoded result is too long for a Python string");
739 return NULL;
740 }
741 size += incr;
742 }
743
Victor Stinnerad771582015-10-09 12:38:53 +0200744 str = _PyBytesWriter_Prepare(writer, str, size);
745 if (str == NULL)
746 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200747
748 /* generate replacement */
749 for (i = collstart; i < collend; ++i) {
750 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200751 *str++ = '\\';
752 if (ch >= 0x00010000) {
753 *str++ = 'U';
754 *str++ = Py_hexdigits[(ch>>28)&0xf];
755 *str++ = Py_hexdigits[(ch>>24)&0xf];
756 *str++ = Py_hexdigits[(ch>>20)&0xf];
757 *str++ = Py_hexdigits[(ch>>16)&0xf];
758 *str++ = Py_hexdigits[(ch>>12)&0xf];
759 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200760 }
Victor Stinner797485e2015-10-09 03:17:30 +0200761 else if (ch >= 0x100) {
762 *str++ = 'u';
763 *str++ = Py_hexdigits[(ch>>12)&0xf];
764 *str++ = Py_hexdigits[(ch>>8)&0xf];
765 }
766 else
767 *str++ = 'x';
768 *str++ = Py_hexdigits[(ch>>4)&0xf];
769 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200770 }
771 return str;
772}
773
774/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
775 ASCII, Latin1, UTF-8, etc. */
776static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200777xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200778 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
779{
Victor Stinnerad771582015-10-09 12:38:53 +0200780 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200781 Py_UCS4 ch;
782 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300783 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200784
785 assert(PyUnicode_IS_READY(unicode));
786 kind = PyUnicode_KIND(unicode);
787 data = PyUnicode_DATA(unicode);
788
789 size = 0;
790 /* determine replacement size */
791 for (i = collstart; i < collend; ++i) {
792 Py_ssize_t incr;
793
794 ch = PyUnicode_READ(kind, data, i);
795 if (ch < 10)
796 incr = 2+1+1;
797 else if (ch < 100)
798 incr = 2+2+1;
799 else if (ch < 1000)
800 incr = 2+3+1;
801 else if (ch < 10000)
802 incr = 2+4+1;
803 else if (ch < 100000)
804 incr = 2+5+1;
805 else if (ch < 1000000)
806 incr = 2+6+1;
807 else {
808 assert(ch <= MAX_UNICODE);
809 incr = 2+7+1;
810 }
811 if (size > PY_SSIZE_T_MAX - incr) {
812 PyErr_SetString(PyExc_OverflowError,
813 "encoded result is too long for a Python string");
814 return NULL;
815 }
816 size += incr;
817 }
818
Victor Stinnerad771582015-10-09 12:38:53 +0200819 str = _PyBytesWriter_Prepare(writer, str, size);
820 if (str == NULL)
821 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200822
823 /* generate replacement */
824 for (i = collstart; i < collend; ++i) {
Christian Heimes07f2ade2020-11-18 16:38:53 +0100825 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
826 if (size < 0) {
827 return NULL;
828 }
829 str += size;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200830 }
831 return str;
832}
833
Thomas Wouters477c8d52006-05-27 19:21:47 +0000834/* --- Bloom Filters ----------------------------------------------------- */
835
/* Support for simple "bloom filters" over Unicode characters.
   To keep things simple, a single bitmask is used; the low bits of each
   Unicode character (ch & (BLOOM_WIDTH - 1)) select the bit index. */
839
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200840/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000841
Antoine Pitrouf068f942010-01-13 14:19:12 +0000842#if LONG_BIT >= 128
843#define BLOOM_WIDTH 128
844#elif LONG_BIT >= 64
845#define BLOOM_WIDTH 64
846#elif LONG_BIT >= 32
847#define BLOOM_WIDTH 32
848#else
849#error "LONG_BIT is smaller than 32"
850#endif
851
Thomas Wouters477c8d52006-05-27 19:21:47 +0000852#define BLOOM_MASK unsigned long
853
Serhiy Storchaka05997252013-01-26 12:14:02 +0200854static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000855
Antoine Pitrouf068f942010-01-13 14:19:12 +0000856#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000857
Benjamin Peterson29060642009-01-31 22:14:21 +0000858#define BLOOM_LINEBREAK(ch) \
859 ((ch) < 128U ? ascii_linebreak[(ch)] : \
860 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000861
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700862static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300863make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000864{
Victor Stinnera85af502013-04-09 21:53:54 +0200865#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
866 do { \
867 TYPE *data = (TYPE *)PTR; \
868 TYPE *end = data + LEN; \
869 Py_UCS4 ch; \
870 for (; data != end; data++) { \
871 ch = *data; \
872 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
873 } \
874 break; \
875 } while (0)
876
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877 /* calculate simple bloom-style bitmask for a given unicode string */
878
Antoine Pitrouf068f942010-01-13 14:19:12 +0000879 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000880
881 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200882 switch (kind) {
883 case PyUnicode_1BYTE_KIND:
884 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
885 break;
886 case PyUnicode_2BYTE_KIND:
887 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
888 break;
889 case PyUnicode_4BYTE_KIND:
890 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
891 break;
892 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700893 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200894 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000895 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200896
897#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000898}
899
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300900static int
901ensure_unicode(PyObject *obj)
902{
903 if (!PyUnicode_Check(obj)) {
904 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200905 "must be str, not %.100s",
906 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300907 return -1;
908 }
909 return PyUnicode_READY(obj);
910}
911
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200912/* Compilation of templated routines */
913
Victor Stinner90ed8a62020-06-24 00:34:07 +0200914#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200915
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200916#include "stringlib/asciilib.h"
917#include "stringlib/fastsearch.h"
918#include "stringlib/partition.h"
919#include "stringlib/split.h"
920#include "stringlib/count.h"
921#include "stringlib/find.h"
922#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200923#include "stringlib/undef.h"
924
925#include "stringlib/ucs1lib.h"
926#include "stringlib/fastsearch.h"
927#include "stringlib/partition.h"
928#include "stringlib/split.h"
929#include "stringlib/count.h"
930#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300931#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200932#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200933#include "stringlib/undef.h"
934
935#include "stringlib/ucs2lib.h"
936#include "stringlib/fastsearch.h"
937#include "stringlib/partition.h"
938#include "stringlib/split.h"
939#include "stringlib/count.h"
940#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300941#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200942#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200943#include "stringlib/undef.h"
944
945#include "stringlib/ucs4lib.h"
946#include "stringlib/fastsearch.h"
947#include "stringlib/partition.h"
948#include "stringlib/split.h"
949#include "stringlib/count.h"
950#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300951#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200952#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200953#include "stringlib/undef.h"
954
Inada Naoki2c4928d2020-06-17 20:09:44 +0900955_Py_COMP_DIAG_PUSH
956_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200957#include "stringlib/unicodedefs.h"
958#include "stringlib/fastsearch.h"
959#include "stringlib/count.h"
960#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100961#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900962_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200963
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200964#undef STRINGLIB_GET_EMPTY
965
Guido van Rossumd57fd912000-03-10 22:53:23 +0000966/* --- Unicode Object ----------------------------------------------------- */
967
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700968static inline Py_ssize_t
969findchar(const void *s, int kind,
970 Py_ssize_t size, Py_UCS4 ch,
971 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200973 switch (kind) {
974 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200975 if ((Py_UCS1) ch != ch)
976 return -1;
977 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600978 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200979 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600980 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200981 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200982 if ((Py_UCS2) ch != ch)
983 return -1;
984 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600985 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200986 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600987 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200988 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200989 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600990 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200991 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600992 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200993 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700994 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200995 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200996}
997
#ifdef Py_DEBUG
/* Overwrite the characters added by a resize (those from index `old_length`
   up to the current length) with 0xff bytes, so that code which forgets to
   initialize them is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings: U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Does nothing when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* The kind value is used as the size in bytes of one character. */
    int unit = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(base + old_length * unit, 0xff,
               (new_length - old_length) * unit);
    }
}
#endif
1016
/* Resize a compact string to `length` characters by reallocating the
   object itself.

   Preconditions (asserted): `unicode` is modifiable, ready and compact.

   Returns the resized object, which is the pointer returned by
   PyObject_Realloc() and may therefore differ from `unicode`; callers must
   use the return value.  Returns NULL with MemoryError set if the new size
   would overflow or the reallocation fails. */
static PyObject*
resize_compact(PyObject *unicode, Py_ssize_t length)
{
    Py_ssize_t char_size;
    Py_ssize_t struct_size;
    Py_ssize_t new_size;
    int share_wstr;
    PyObject *new_unicode;
#ifdef Py_DEBUG
    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

    assert(unicode_modifiable(unicode));
    assert(PyUnicode_IS_READY(unicode));
    assert(PyUnicode_IS_COMPACT(unicode));

    /* The kind value is used as the size in bytes of one character. */
    char_size = PyUnicode_KIND(unicode);
    /* The character data follows the header; the header type depends on
       whether the string is ASCII. */
    if (PyUnicode_IS_ASCII(unicode))
        struct_size = sizeof(PyASCIIObject);
    else
        struct_size = sizeof(PyCompactUnicodeObject);
    /* Sampled before the reallocation, acted upon after it. */
    share_wstr = _PyUnicode_SHARE_WSTR(unicode);

    /* Overflow check for struct_size + (length + 1) * char_size. */
    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    /* +1 character for the trailing null written at the end. */
    new_size = (struct_size + (length + 1) * char_size);

    /* Drop a separately allocated UTF-8 buffer, if any. */
    if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
        PyObject_Free(_PyUnicode_UTF8(unicode));
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
    }
    /* Undo the reference bookkeeping for the old object before the
       reallocation; _Py_NewReference() is called again below on whichever
       pointer survives.
       NOTE(review): presumably needed because the object's address may
       change -- confirm against _Py_ForgetReference/_Py_NewReference. */
#ifdef Py_REF_DEBUG
    _Py_RefTotal--;
#endif
#ifdef Py_TRACE_REFS
    _Py_ForgetReference(unicode);
#endif

    new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
    if (new_unicode == NULL) {
        /* Reallocation failed: the old object is still valid, so restore
           its reference bookkeeping before reporting the error. */
        _Py_NewReference(unicode);
        PyErr_NoMemory();
        return NULL;
    }
    unicode = new_unicode;
    _Py_NewReference(unicode);

    _PyUnicode_LENGTH(unicode) = length;
    if (share_wstr) {
        /* wstr aliased the character data: point it at the data of the
           reallocated object. */
        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = length;
    }
    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
        /* A separately allocated wstr buffer is released rather than
           resized. */
        PyObject_Free(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        if (!PyUnicode_IS_ASCII(unicode))
            _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
#ifdef Py_DEBUG
    unicode_fill_invalid(unicode, old_length);
#endif
    /* Write the trailing null character. */
    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
                    length, 0);
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return unicode;
}
1087
/* Resize a non-compact string to `length` characters by reallocating its
   buffers; the object itself is not moved.

   Preconditions (asserted): `unicode` is not compact and has a reference
   count of 1.

   If the string is ready, its character data buffer is reallocated and
   wstr/utf8 pointers that alias it are updated.  If it also has a separate
   wstr buffer, or if the string is not ready, the wstr buffer is
   reallocated as well.

   Returns 0 on success, -1 with MemoryError set on overflow or allocation
   failure. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        /* The kind value is used as the size in bytes of one character. */
        char_size = PyUnicode_KIND(unicode);
        /* Sampled before the reallocation, acted upon after it. */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* Overflow check for (length + 1) * char_size. */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        /* +1 character for the trailing null. */
        new_size = (length + 1) * char_size;

        /* Drop a separately allocated UTF-8 buffer; a shared one is simply
           re-pointed after the reallocation. */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_Free(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* On failure only the local `data` becomes NULL; the object still
           refers to its old buffer. */
        data = (PyObject *)PyObject_Realloc(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* Write the trailing null character. */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* Done unless there is a separate wstr buffer to resize below. */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr = _PyUnicode_WSTR(unicode);
    /* On failure only the local `wstr` becomes NULL; the object still
       refers to its old buffer. */
    wstr = PyObject_Realloc(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
1166
Victor Stinnerfe226c02011-10-03 03:52:20 +02001167static PyObject*
1168resize_copy(PyObject *unicode, Py_ssize_t length)
1169{
1170 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001171 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001172 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001173
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001174 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001175
1176 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1177 if (copy == NULL)
1178 return NULL;
1179
1180 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001181 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001182 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001183 }
1184 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001185 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001186
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001187 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188 if (w == NULL)
1189 return NULL;
1190 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1191 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001192 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001193 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001194 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 }
1196}
1197
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001199 Ux0000 terminated; some code (e.g. new_identifier)
1200 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
1202 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001203 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204
1205*/
1206
Alexander Belopolsky40018472011-02-26 01:02:56 +00001207static PyUnicodeObject *
1208_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001210 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001212
Thomas Wouters477c8d52006-05-27 19:21:47 +00001213 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001214 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001215 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 }
1217
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001218 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001219 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001220 return (PyUnicodeObject *)PyErr_NoMemory();
1221 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222 if (length < 0) {
1223 PyErr_SetString(PyExc_SystemError,
1224 "Negative size passed to _PyUnicode_New");
1225 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 }
1227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1229 if (unicode == NULL)
1230 return NULL;
1231 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001232
1233 _PyUnicode_WSTR_LENGTH(unicode) = length;
1234 _PyUnicode_HASH(unicode) = -1;
1235 _PyUnicode_STATE(unicode).interned = 0;
1236 _PyUnicode_STATE(unicode).kind = 0;
1237 _PyUnicode_STATE(unicode).compact = 0;
1238 _PyUnicode_STATE(unicode).ready = 0;
1239 _PyUnicode_STATE(unicode).ascii = 0;
1240 _PyUnicode_DATA_ANY(unicode) = NULL;
1241 _PyUnicode_LENGTH(unicode) = 0;
1242 _PyUnicode_UTF8(unicode) = NULL;
1243 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1244
Victor Stinner32bd68c2020-12-01 10:37:39 +01001245 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001247 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001248 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001249 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251
Jeremy Hyltond8082792003-09-16 19:41:39 +00001252 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001253 * the caller fails before initializing str -- unicode_resize()
1254 * reads str[0], and the Keep-Alive optimization can keep memory
1255 * allocated for str alive across a call to unicode_dealloc(unicode).
1256 * We don't want unicode_resize to read uninitialized memory in
1257 * that case.
1258 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001259 _PyUnicode_WSTR(unicode)[0] = 0;
1260 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001261
Victor Stinner7931d9a2011-11-04 00:22:48 +01001262 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263 return unicode;
1264}
1265
Victor Stinnerf42dc442011-10-02 23:33:16 +02001266static const char*
1267unicode_kind_name(PyObject *unicode)
1268{
Victor Stinner42dfd712011-10-03 14:41:45 +02001269 /* don't check consistency: unicode_kind_name() is called from
1270 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001271 if (!PyUnicode_IS_COMPACT(unicode))
1272 {
1273 if (!PyUnicode_IS_READY(unicode))
1274 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001275 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001276 {
1277 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001278 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001279 return "legacy ascii";
1280 else
1281 return "legacy latin1";
1282 case PyUnicode_2BYTE_KIND:
1283 return "legacy UCS2";
1284 case PyUnicode_4BYTE_KIND:
1285 return "legacy UCS4";
1286 default:
1287 return "<legacy invalid kind>";
1288 }
1289 }
1290 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001291 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001292 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001293 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001294 return "ascii";
1295 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001296 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001297 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001298 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001299 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001300 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001301 default:
1302 return "<invalid compact kind>";
1303 }
1304}
1305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001308const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001309 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001310 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311}
1312
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001313const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001314 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001315 return _PyUnicode_COMPACT_DATA(unicode);
1316}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001317const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001318 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001319 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1321 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1322 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1323 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1324 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1325 return PyUnicode_DATA(unicode);
1326}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001327
1328void
1329_PyUnicode_Dump(PyObject *op)
1330{
1331 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001332 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1333 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001334 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001335
Victor Stinnera849a4b2011-10-03 12:12:11 +02001336 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001337 {
1338 if (ascii->state.ascii)
1339 data = (ascii + 1);
1340 else
1341 data = (compact + 1);
1342 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001343 else
1344 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001345 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001346
Victor Stinnera849a4b2011-10-03 12:12:11 +02001347 if (ascii->wstr == data)
1348 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001349 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001350
Victor Stinnera3b334d2011-10-03 13:53:37 +02001351 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001352 printf(" (%zu), ", compact->wstr_length);
1353 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001354 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001355 }
1356 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001357 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001358 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001359}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001360#endif
1361
Victor Stinner91698d82020-06-25 14:07:40 +02001362static int
1363unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1364{
1365 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1366 // optimized to always use state->empty_string without having to check if
1367 // it is NULL or not.
1368 PyObject *empty = PyUnicode_New(1, 0);
1369 if (empty == NULL) {
1370 return -1;
1371 }
1372 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1373 _PyUnicode_LENGTH(empty) = 0;
1374 assert(_PyUnicode_CheckConsistency(empty, 1));
1375
1376 assert(state->empty_string == NULL);
1377 state->empty_string = empty;
1378 return 0;
1379}
1380
1381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382PyObject *
1383PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1384{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001385 /* Optimization for empty strings */
1386 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001387 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001388 }
1389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 PyObject *obj;
1391 PyCompactUnicodeObject *unicode;
1392 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001393 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001394 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 Py_ssize_t char_size;
1396 Py_ssize_t struct_size;
1397
Victor Stinner9e9d6892011-10-04 01:02:02 +02001398 is_ascii = 0;
1399 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 struct_size = sizeof(PyCompactUnicodeObject);
1401 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001402 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 char_size = 1;
1404 is_ascii = 1;
1405 struct_size = sizeof(PyASCIIObject);
1406 }
1407 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001408 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 char_size = 1;
1410 }
1411 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001412 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 char_size = 2;
1414 if (sizeof(wchar_t) == 2)
1415 is_sharing = 1;
1416 }
1417 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001418 if (maxchar > MAX_UNICODE) {
1419 PyErr_SetString(PyExc_SystemError,
1420 "invalid maximum character passed to PyUnicode_New");
1421 return NULL;
1422 }
Victor Stinner8f825062012-04-27 13:55:39 +02001423 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 char_size = 4;
1425 if (sizeof(wchar_t) == 4)
1426 is_sharing = 1;
1427 }
1428
1429 /* Ensure we won't overflow the size. */
1430 if (size < 0) {
1431 PyErr_SetString(PyExc_SystemError,
1432 "Negative size passed to PyUnicode_New");
1433 return NULL;
1434 }
1435 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1436 return PyErr_NoMemory();
1437
1438 /* Duplicated allocation code from _PyObject_New() instead of a call to
1439 * PyObject_New() so we are able to allocate space for the object and
1440 * it's data buffer.
1441 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001442 obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001443 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001445 }
1446 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447
1448 unicode = (PyCompactUnicodeObject *)obj;
1449 if (is_ascii)
1450 data = ((PyASCIIObject*)obj) + 1;
1451 else
1452 data = unicode + 1;
1453 _PyUnicode_LENGTH(unicode) = size;
1454 _PyUnicode_HASH(unicode) = -1;
1455 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001456 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 _PyUnicode_STATE(unicode).compact = 1;
1458 _PyUnicode_STATE(unicode).ready = 1;
1459 _PyUnicode_STATE(unicode).ascii = is_ascii;
1460 if (is_ascii) {
1461 ((char*)data)[size] = 0;
1462 _PyUnicode_WSTR(unicode) = NULL;
1463 }
Victor Stinner8f825062012-04-27 13:55:39 +02001464 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 ((char*)data)[size] = 0;
1466 _PyUnicode_WSTR(unicode) = NULL;
1467 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001469 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001470 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 else {
1472 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001473 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001474 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001476 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477 ((Py_UCS4*)data)[size] = 0;
1478 if (is_sharing) {
1479 _PyUnicode_WSTR_LENGTH(unicode) = size;
1480 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1481 }
1482 else {
1483 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1484 _PyUnicode_WSTR(unicode) = NULL;
1485 }
1486 }
Victor Stinner8f825062012-04-27 13:55:39 +02001487#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001488 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001489#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001490 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001491 return obj;
1492}
1493
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4, writing the result
   into the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is joined into one code point; everything
   else (including lone surrogates) is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);

    const wchar_t *src = begin;
    Py_UCS4 *dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        int is_pair = (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
                       && (src + 1) < end
                       && Py_UNICODE_IS_LOW_SURROGATE(src[1]));
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }
    /* The output buffer must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1533
Victor Stinnercd9950f2011-10-02 00:34:53 +02001534static int
Victor Stinner488fa492011-12-12 00:01:39 +01001535unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001536{
Victor Stinner488fa492011-12-12 00:01:39 +01001537 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001538 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001539 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001540 return -1;
1541 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001542 return 0;
1543}
1544
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001545static int
1546_copy_characters(PyObject *to, Py_ssize_t to_start,
1547 PyObject *from, Py_ssize_t from_start,
1548 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001549{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001550 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001551 const void *from_data;
1552 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001553
Victor Stinneree4544c2012-05-09 22:24:08 +02001554 assert(0 <= how_many);
1555 assert(0 <= from_start);
1556 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001557 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001558 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001559 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001560
Victor Stinnerd3f08822012-05-29 12:57:52 +02001561 assert(PyUnicode_Check(to));
1562 assert(PyUnicode_IS_READY(to));
1563 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1564
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001565 if (how_many == 0)
1566 return 0;
1567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001568 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001569 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001571 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001572
Victor Stinnerf1852262012-06-16 16:38:26 +02001573#ifdef Py_DEBUG
1574 if (!check_maxchar
1575 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1576 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001577 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001578 Py_UCS4 ch;
1579 Py_ssize_t i;
1580 for (i=0; i < how_many; i++) {
1581 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1582 assert(ch <= to_maxchar);
1583 }
1584 }
1585#endif
1586
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001587 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001588 if (check_maxchar
1589 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1590 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001591 /* Writing Latin-1 characters into an ASCII string requires to
1592 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001593 Py_UCS4 max_char;
1594 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001595 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001596 if (max_char >= 128)
1597 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001598 }
Christian Heimesf051e432016-09-13 20:22:02 +02001599 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001600 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001601 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001603 else if (from_kind == PyUnicode_1BYTE_KIND
1604 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001605 {
1606 _PyUnicode_CONVERT_BYTES(
1607 Py_UCS1, Py_UCS2,
1608 PyUnicode_1BYTE_DATA(from) + from_start,
1609 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1610 PyUnicode_2BYTE_DATA(to) + to_start
1611 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001612 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001613 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001614 && to_kind == PyUnicode_4BYTE_KIND)
1615 {
1616 _PyUnicode_CONVERT_BYTES(
1617 Py_UCS1, Py_UCS4,
1618 PyUnicode_1BYTE_DATA(from) + from_start,
1619 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1620 PyUnicode_4BYTE_DATA(to) + to_start
1621 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001622 }
1623 else if (from_kind == PyUnicode_2BYTE_KIND
1624 && to_kind == PyUnicode_4BYTE_KIND)
1625 {
1626 _PyUnicode_CONVERT_BYTES(
1627 Py_UCS2, Py_UCS4,
1628 PyUnicode_2BYTE_DATA(from) + from_start,
1629 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1630 PyUnicode_4BYTE_DATA(to) + to_start
1631 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001632 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001633 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001634 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1635
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001636 if (!check_maxchar) {
1637 if (from_kind == PyUnicode_2BYTE_KIND
1638 && to_kind == PyUnicode_1BYTE_KIND)
1639 {
1640 _PyUnicode_CONVERT_BYTES(
1641 Py_UCS2, Py_UCS1,
1642 PyUnicode_2BYTE_DATA(from) + from_start,
1643 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1644 PyUnicode_1BYTE_DATA(to) + to_start
1645 );
1646 }
1647 else if (from_kind == PyUnicode_4BYTE_KIND
1648 && to_kind == PyUnicode_1BYTE_KIND)
1649 {
1650 _PyUnicode_CONVERT_BYTES(
1651 Py_UCS4, Py_UCS1,
1652 PyUnicode_4BYTE_DATA(from) + from_start,
1653 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1654 PyUnicode_1BYTE_DATA(to) + to_start
1655 );
1656 }
1657 else if (from_kind == PyUnicode_4BYTE_KIND
1658 && to_kind == PyUnicode_2BYTE_KIND)
1659 {
1660 _PyUnicode_CONVERT_BYTES(
1661 Py_UCS4, Py_UCS2,
1662 PyUnicode_4BYTE_DATA(from) + from_start,
1663 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1664 PyUnicode_2BYTE_DATA(to) + to_start
1665 );
1666 }
1667 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001668 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001669 }
1670 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001671 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001672 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001673 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001674 Py_ssize_t i;
1675
Victor Stinnera0702ab2011-09-29 14:14:38 +02001676 for (i=0; i < how_many; i++) {
1677 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001678 if (ch > to_maxchar)
1679 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001680 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1681 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001682 }
1683 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001684 return 0;
1685}
1686
Victor Stinnerd3f08822012-05-29 12:57:52 +02001687void
1688_PyUnicode_FastCopyCharacters(
1689 PyObject *to, Py_ssize_t to_start,
1690 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001691{
1692 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1693}
1694
1695Py_ssize_t
1696PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1697 PyObject *from, Py_ssize_t from_start,
1698 Py_ssize_t how_many)
1699{
1700 int err;
1701
1702 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1703 PyErr_BadInternalCall();
1704 return -1;
1705 }
1706
Benjamin Petersonbac79492012-01-14 13:34:47 -05001707 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001708 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001709 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001710 return -1;
1711
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001712 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001713 PyErr_SetString(PyExc_IndexError, "string index out of range");
1714 return -1;
1715 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001716 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001717 PyErr_SetString(PyExc_IndexError, "string index out of range");
1718 return -1;
1719 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001720 if (how_many < 0) {
1721 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1722 return -1;
1723 }
1724 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001725 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1726 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001727 "Cannot write %zi characters at %zi "
1728 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001729 how_many, to_start, PyUnicode_GET_LENGTH(to));
1730 return -1;
1731 }
1732
1733 if (how_many == 0)
1734 return 0;
1735
Victor Stinner488fa492011-12-12 00:01:39 +01001736 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001737 return -1;
1738
1739 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1740 if (err) {
1741 PyErr_Format(PyExc_SystemError,
1742 "Cannot copy %s characters "
1743 "into a string of %s characters",
1744 unicode_kind_name(from),
1745 unicode_kind_name(to));
1746 return -1;
1747 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001748 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749}
1750
Victor Stinner17222162011-09-28 22:15:37 +02001751/* Find the maximum code point and count the number of surrogate pairs so a
1752 correct string length can be computed before converting a string to UCS4.
1753 This function counts single surrogates as a character and not as a pair.
1754
1755 Return 0 on success, or -1 on error. */
1756static int
1757find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1758 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759{
1760 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001761 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762
Victor Stinnerc53be962011-10-02 21:33:54 +02001763 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 *num_surrogates = 0;
1765 *maxchar = 0;
1766
1767 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001769 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1770 && (iter+1) < end
1771 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1772 {
1773 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1774 ++(*num_surrogates);
1775 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 }
1777 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001779 {
1780 ch = *iter;
1781 iter++;
1782 }
1783 if (ch > *maxchar) {
1784 *maxchar = ch;
1785 if (*maxchar > MAX_UNICODE) {
1786 PyErr_Format(PyExc_ValueError,
1787 "character U+%x is not in range [U+0000; U+10ffff]",
1788 ch);
1789 return -1;
1790 }
1791 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 }
1793 return 0;
1794}
1795
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001796int
1797_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798{
1799 wchar_t *end;
1800 Py_UCS4 maxchar = 0;
1801 Py_ssize_t num_surrogates;
1802#if SIZEOF_WCHAR_T == 2
1803 Py_ssize_t length_wo_surrogates;
1804#endif
1805
Georg Brandl7597add2011-10-05 16:36:47 +02001806 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001807 strings were created using _PyObject_New() and where no canonical
1808 representation (the str field) has been set yet aka strings
1809 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001810 assert(_PyUnicode_CHECK(unicode));
1811 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001813 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001814 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001815 /* Actually, it should neither be interned nor be anything else: */
1816 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001819 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822
1823 if (maxchar < 256) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001824 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001825 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 PyErr_NoMemory();
1827 return -1;
1828 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001829 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830 _PyUnicode_WSTR(unicode), end,
1831 PyUnicode_1BYTE_DATA(unicode));
1832 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1833 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1834 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1835 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001836 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001838 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 }
1840 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001841 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001842 _PyUnicode_UTF8(unicode) = NULL;
1843 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001845 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 _PyUnicode_WSTR(unicode) = NULL;
1847 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1848 }
1849 /* In this case we might have to convert down from 4-byte native
1850 wchar_t to 2-byte unicode. */
1851 else if (maxchar < 65536) {
1852 assert(num_surrogates == 0 &&
1853 "FindMaxCharAndNumSurrogatePairs() messed up");
1854
Victor Stinner506f5922011-09-28 22:34:18 +02001855#if SIZEOF_WCHAR_T == 2
1856 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001857 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001858 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1859 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1860 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001861 _PyUnicode_UTF8(unicode) = NULL;
1862 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001863#else
1864 /* sizeof(wchar_t) == 4 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001865 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
Victor Stinner506f5922011-09-28 22:34:18 +02001866 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001867 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001868 PyErr_NoMemory();
1869 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001870 }
Victor Stinner506f5922011-09-28 22:34:18 +02001871 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1872 _PyUnicode_WSTR(unicode), end,
1873 PyUnicode_2BYTE_DATA(unicode));
1874 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1875 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1876 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001877 _PyUnicode_UTF8(unicode) = NULL;
1878 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner32bd68c2020-12-01 10:37:39 +01001879 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinner506f5922011-09-28 22:34:18 +02001880 _PyUnicode_WSTR(unicode) = NULL;
1881 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1882#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 }
Ikko Ashimine38811d62020-11-10 14:57:34 +09001884 /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 else {
1886#if SIZEOF_WCHAR_T == 2
1887 /* in case the native representation is 2-bytes, we need to allocate a
1888 new normalized 4-byte version. */
1889 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001890 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1891 PyErr_NoMemory();
1892 return -1;
1893 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001894 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001895 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 PyErr_NoMemory();
1897 return -1;
1898 }
1899 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1900 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001901 _PyUnicode_UTF8(unicode) = NULL;
1902 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001903 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1904 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001905 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001906 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 _PyUnicode_WSTR(unicode) = NULL;
1908 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1909#else
1910 assert(num_surrogates == 0);
1911
Victor Stinnerc3c74152011-10-02 20:39:55 +02001912 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001914 _PyUnicode_UTF8(unicode) = NULL;
1915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001916 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1917#endif
1918 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1919 }
1920 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001921 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922 return 0;
1923}
1924
Alexander Belopolsky40018472011-02-26 01:02:56 +00001925static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001926unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927{
Walter Dörwald16807132007-05-25 13:52:07 +00001928 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001929 case SSTATE_NOT_INTERNED:
1930 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001931
Benjamin Peterson29060642009-01-31 22:14:21 +00001932 case SSTATE_INTERNED_MORTAL:
Victor Stinnerea251802020-12-26 02:58:33 +01001933 {
1934 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner3549ca32020-07-03 16:59:12 +02001935 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1936 references (key and value) which were ignored by
1937 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1938 to prevent calling unicode_dealloc() again. Adjust refcnt after
1939 PyDict_DelItem(). */
1940 assert(Py_REFCNT(unicode) == 0);
1941 Py_SET_REFCNT(unicode, 3);
Victor Stinnerea251802020-12-26 02:58:33 +01001942 if (PyDict_DelItem(state->interned, unicode) != 0) {
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001943 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1944 NULL);
1945 }
Victor Stinner3549ca32020-07-03 16:59:12 +02001946 assert(Py_REFCNT(unicode) == 1);
1947 Py_SET_REFCNT(unicode, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001948 break;
Victor Stinnerea251802020-12-26 02:58:33 +01001949 }
Walter Dörwald16807132007-05-25 13:52:07 +00001950
Benjamin Peterson29060642009-01-31 22:14:21 +00001951 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001952 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1953 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001954
Benjamin Peterson29060642009-01-31 22:14:21 +00001955 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001956 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001957 }
1958
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001959 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001960 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001961 }
1962 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001963 PyObject_Free(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001964 }
1965 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001966 PyObject_Free(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001969 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970}
1971
#ifdef Py_DEBUG
/* Return 1 if unicode is the empty string singleton or one of the cached
   single-character Latin-1 strings of the unicode state, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *state = get_unicode_state();
    if (state->empty_string == unicode) {
        return 1;
    }

    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    /* Only strings with a canonical kind and exactly one character can be
       in the latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && state->latin1[ch] == unicode) ? 1 : 0;
}
#endif
1991
Alexander Belopolsky40018472011-02-26 01:02:56 +00001992static int
Victor Stinner488fa492011-12-12 00:01:39 +01001993unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001994{
Victor Stinner488fa492011-12-12 00:01:39 +01001995 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001996 if (Py_REFCNT(unicode) != 1)
1997 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001998 if (_PyUnicode_HASH(unicode) != -1)
1999 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002000 if (PyUnicode_CHECK_INTERNED(unicode))
2001 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002002 if (!PyUnicode_CheckExact(unicode))
2003 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002004#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002005 /* singleton refcount is greater than 1 */
2006 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002007#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002008 return 1;
2009}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002010
Victor Stinnerfe226c02011-10-03 03:52:20 +02002011static int
2012unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2013{
2014 PyObject *unicode;
2015 Py_ssize_t old_length;
2016
2017 assert(p_unicode != NULL);
2018 unicode = *p_unicode;
2019
2020 assert(unicode != NULL);
2021 assert(PyUnicode_Check(unicode));
2022 assert(0 <= length);
2023
Victor Stinner910337b2011-10-03 03:20:16 +02002024 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002025 old_length = PyUnicode_WSTR_LENGTH(unicode);
2026 else
2027 old_length = PyUnicode_GET_LENGTH(unicode);
2028 if (old_length == length)
2029 return 0;
2030
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002031 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002032 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002033 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002034 return 0;
2035 }
2036
Victor Stinner488fa492011-12-12 00:01:39 +01002037 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002038 PyObject *copy = resize_copy(unicode, length);
2039 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002040 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002041 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002042 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002043 }
2044
Victor Stinnerfe226c02011-10-03 03:52:20 +02002045 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002046 PyObject *new_unicode = resize_compact(unicode, length);
2047 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002048 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002049 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002050 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002051 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002052 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002053}
2054
Alexander Belopolsky40018472011-02-26 01:02:56 +00002055int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002056PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002057{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002058 PyObject *unicode;
2059 if (p_unicode == NULL) {
2060 PyErr_BadInternalCall();
2061 return -1;
2062 }
2063 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002064 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002065 {
2066 PyErr_BadInternalCall();
2067 return -1;
2068 }
2069 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002070}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002071
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002072/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002073
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002074 WARNING: The function doesn't copy the terminating null character and
2075 doesn't check the maximum character (may write a latin1 character in an
2076 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002077static void
2078unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2079 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002080{
2081 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002082 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002083 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002084
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002085 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002086 switch (kind) {
2087 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002088#ifdef Py_DEBUG
2089 if (PyUnicode_IS_ASCII(unicode)) {
2090 Py_UCS4 maxchar = ucs1lib_find_max_char(
2091 (const Py_UCS1*)str,
2092 (const Py_UCS1*)str + len);
2093 assert(maxchar < 128);
2094 }
2095#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002096 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002097 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002098 }
2099 case PyUnicode_2BYTE_KIND: {
2100 Py_UCS2 *start = (Py_UCS2 *)data + index;
2101 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002102
Victor Stinner184252a2012-06-16 02:57:41 +02002103 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002104 *ucs2 = (Py_UCS2)*str;
2105
2106 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002107 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002108 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002109 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002110 Py_UCS4 *start = (Py_UCS4 *)data + index;
2111 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002112
Victor Stinner184252a2012-06-16 02:57:41 +02002113 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002114 *ucs4 = (Py_UCS4)*str;
2115
2116 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002117 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002118 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002119 default:
2120 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002121 }
2122}
2123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002124static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002125get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002126{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002127 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002128
Victor Stinner2f9ada92020-06-24 02:22:21 +02002129 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002130 if (unicode) {
2131 Py_INCREF(unicode);
2132 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002133 }
Victor Stinner607b1022020-05-05 18:50:30 +02002134
2135 unicode = PyUnicode_New(1, ch);
2136 if (!unicode) {
2137 return NULL;
2138 }
2139
2140 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2141 assert(_PyUnicode_CheckConsistency(unicode, 1));
2142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002143 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002144 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002145 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146}
2147
Victor Stinner985a82a2014-01-03 12:53:47 +01002148static PyObject*
2149unicode_char(Py_UCS4 ch)
2150{
2151 PyObject *unicode;
2152
2153 assert(ch <= MAX_UNICODE);
2154
Victor Stinner2f9ada92020-06-24 02:22:21 +02002155 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002156 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002157 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002158
Victor Stinner985a82a2014-01-03 12:53:47 +01002159 unicode = PyUnicode_New(1, ch);
2160 if (unicode == NULL)
2161 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002162
2163 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2164 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002165 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002166 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002167 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2168 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2169 }
2170 assert(_PyUnicode_CheckConsistency(unicode, 1));
2171 return unicode;
2172}
2173
Alexander Belopolsky40018472011-02-26 01:02:56 +00002174PyObject *
2175PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176{
Inada Naoki038dd0f2020-06-30 15:26:56 +09002177 if (u == NULL) {
2178 if (size > 0) {
2179 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2180 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2181 "use PyUnicode_New() instead", 1) < 0) {
2182 return NULL;
2183 }
2184 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002185 return (PyObject*)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002186 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002187
2188 if (size < 0) {
2189 PyErr_BadInternalCall();
2190 return NULL;
2191 }
2192
2193 return PyUnicode_FromWideChar(u, size);
2194}
2195
2196PyObject *
2197PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2198{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002199 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 Py_UCS4 maxchar = 0;
2201 Py_ssize_t num_surrogates;
2202
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002203 if (u == NULL && size != 0) {
2204 PyErr_BadInternalCall();
2205 return NULL;
2206 }
2207
2208 if (size == -1) {
2209 size = wcslen(u);
2210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002212 /* If the Unicode data is known at construction time, we can apply
2213 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002216 if (size == 0)
2217 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 /* Single character Unicode objects in the Latin-1 range are
2220 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002221 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 return get_latin1_char((unsigned char)*u);
2223
2224 /* If not empty and not single character, copy the Unicode data
2225 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002226 if (find_maxchar_surrogates(u, u + size,
2227 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 return NULL;
2229
Victor Stinner8faf8212011-12-08 22:14:11 +01002230 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002231 if (!unicode)
2232 return NULL;
2233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 switch (PyUnicode_KIND(unicode)) {
2235 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002236 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2238 break;
2239 case PyUnicode_2BYTE_KIND:
2240#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002241 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002243 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2245#endif
2246 break;
2247 case PyUnicode_4BYTE_KIND:
2248#if SIZEOF_WCHAR_T == 2
2249 /* This is the only case which has to process surrogates, thus
2250 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002251 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252#else
2253 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002254 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002255#endif
2256 break;
2257 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002258 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002261 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262}
2263
Alexander Belopolsky40018472011-02-26 01:02:56 +00002264PyObject *
2265PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002266{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002267 if (size < 0) {
2268 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002269 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002270 return NULL;
2271 }
Inada Naoki038dd0f2020-06-30 15:26:56 +09002272 if (u != NULL) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002273 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002274 }
2275 else {
2276 if (size > 0) {
2277 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2278 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2279 "use PyUnicode_New() instead", 1) < 0) {
2280 return NULL;
2281 }
2282 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002283 return (PyObject *)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002284 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002285}
2286
Alexander Belopolsky40018472011-02-26 01:02:56 +00002287PyObject *
2288PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002289{
2290 size_t size = strlen(u);
2291 if (size > PY_SSIZE_T_MAX) {
2292 PyErr_SetString(PyExc_OverflowError, "input too long");
2293 return NULL;
2294 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002295 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002296}
2297
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002298
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002299PyObject *
2300_PyUnicode_FromId(_Py_Identifier *id)
2301{
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002302 PyInterpreterState *interp = _PyInterpreterState_GET();
2303 struct _Py_unicode_ids *ids = &interp->unicode.ids;
2304
2305 int index = _Py_atomic_size_get(&id->index);
2306 if (index < 0) {
2307 struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2308
2309 PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2310 // Check again to detect concurrent access. Another thread can have
2311 // initialized the index while this thread waited for the lock.
2312 index = _Py_atomic_size_get(&id->index);
2313 if (index < 0) {
2314 assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2315 index = rt_ids->next_index;
2316 rt_ids->next_index++;
2317 _Py_atomic_size_set(&id->index, index);
2318 }
2319 PyThread_release_lock(rt_ids->lock);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002320 }
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002321 assert(index >= 0);
Victor Stinner297257f2020-06-02 14:39:45 +02002322
2323 PyObject *obj;
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002324 if (index < ids->size) {
2325 obj = ids->array[index];
2326 if (obj) {
2327 // Return a borrowed reference
2328 return obj;
2329 }
2330 }
2331
2332 obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
Victor Stinner297257f2020-06-02 14:39:45 +02002333 NULL, NULL);
2334 if (!obj) {
2335 return NULL;
2336 }
2337 PyUnicode_InternInPlace(&obj);
2338
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002339 if (index >= ids->size) {
2340 // Overallocate to reduce the number of realloc
2341 Py_ssize_t new_size = Py_MAX(index * 2, 16);
2342 Py_ssize_t item_size = sizeof(ids->array[0]);
2343 PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2344 if (new_array == NULL) {
2345 PyErr_NoMemory();
2346 return NULL;
2347 }
2348 memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2349 ids->array = new_array;
2350 ids->size = new_size;
2351 }
2352
2353 // The array stores a strong reference
2354 ids->array[index] = obj;
2355
2356 // Return a borrowed reference
2357 return obj;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002358}
2359
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002360
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002361static void
Victor Stinnerf4507232020-12-26 20:26:08 +01002362unicode_clear_identifiers(struct _Py_unicode_state *state)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002363{
Victor Stinnerf4507232020-12-26 20:26:08 +01002364 struct _Py_unicode_ids *ids = &state->ids;
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002365 for (Py_ssize_t i=0; i < ids->size; i++) {
2366 Py_XDECREF(ids->array[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002367 }
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002368 ids->size = 0;
2369 PyMem_Free(ids->array);
2370 ids->array = NULL;
2371 // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2372 // after Py_Finalize().
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002373}
2374
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002375
Benjamin Peterson0df54292012-03-26 14:50:32 -04002376/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002377
Victor Stinnerd3f08822012-05-29 12:57:52 +02002378PyObject*
2379_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002380{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002381 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002382 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002383 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002384#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002385 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002386#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002387 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002388 }
Victor Stinner785938e2011-12-11 20:09:03 +01002389 unicode = PyUnicode_New(size, 127);
2390 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002391 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002392 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2393 assert(_PyUnicode_CheckConsistency(unicode, 1));
2394 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002395}
2396
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002397static Py_UCS4
2398kind_maxchar_limit(unsigned int kind)
2399{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002400 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002401 case PyUnicode_1BYTE_KIND:
2402 return 0x80;
2403 case PyUnicode_2BYTE_KIND:
2404 return 0x100;
2405 case PyUnicode_4BYTE_KIND:
2406 return 0x10000;
2407 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002408 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002409 }
2410}
2411
Victor Stinner702c7342011-10-05 13:50:52 +02002412static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002413_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002414{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002415 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002416 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002417
Victor Stinner2f9ada92020-06-24 02:22:21 +02002418 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002419 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002420 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002421 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002422 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002423 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002424 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002425
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002426 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002427 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002428 if (!res)
2429 return NULL;
2430 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002431 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002432 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002433}
2434
Victor Stinnere57b1c02011-09-28 22:20:48 +02002435static PyObject*
2436_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002437{
2438 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002439 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002440
Serhiy Storchaka678db842013-01-26 12:16:36 +02002441 if (size == 0)
2442 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002443 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002444 if (size == 1)
2445 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002446
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002447 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002448 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002449 if (!res)
2450 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002451 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002452 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002453 else {
2454 _PyUnicode_CONVERT_BYTES(
2455 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2456 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002457 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002458 return res;
2459}
2460
Victor Stinnere57b1c02011-09-28 22:20:48 +02002461static PyObject*
2462_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002463{
2464 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002465 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002466
Serhiy Storchaka678db842013-01-26 12:16:36 +02002467 if (size == 0)
2468 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002469 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002470 if (size == 1)
2471 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002472
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002473 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002474 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002475 if (!res)
2476 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002477 if (max_char < 256)
2478 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2479 PyUnicode_1BYTE_DATA(res));
2480 else if (max_char < 0x10000)
2481 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2482 PyUnicode_2BYTE_DATA(res));
2483 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002484 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002485 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002486 return res;
2487}
2488
2489PyObject*
2490PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2491{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002492 if (size < 0) {
2493 PyErr_SetString(PyExc_ValueError, "size must be positive");
2494 return NULL;
2495 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002496 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002498 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002500 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002501 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002502 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002503 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002504 PyErr_SetString(PyExc_SystemError, "invalid kind");
2505 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002506 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002507}
2508
Victor Stinnerece58de2012-04-23 23:36:38 +02002509Py_UCS4
2510_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2511{
2512 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002513 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002514
2515 assert(PyUnicode_IS_READY(unicode));
2516 assert(0 <= start);
2517 assert(end <= PyUnicode_GET_LENGTH(unicode));
2518 assert(start <= end);
2519
2520 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2521 return PyUnicode_MAX_CHAR_VALUE(unicode);
2522
2523 if (start == end)
2524 return 127;
2525
Victor Stinner94d558b2012-04-27 22:26:58 +02002526 if (PyUnicode_IS_ASCII(unicode))
2527 return 127;
2528
Victor Stinnerece58de2012-04-23 23:36:38 +02002529 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002530 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002531 endptr = (char *)startptr + end * kind;
2532 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002533 switch(kind) {
2534 case PyUnicode_1BYTE_KIND:
2535 return ucs1lib_find_max_char(startptr, endptr);
2536 case PyUnicode_2BYTE_KIND:
2537 return ucs2lib_find_max_char(startptr, endptr);
2538 case PyUnicode_4BYTE_KIND:
2539 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002540 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002541 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002542 }
2543}
2544
Victor Stinner25a4b292011-10-06 12:31:55 +02002545/* Ensure that a string uses the most efficient storage, if it is not the
2546 case: create a new string with of the right kind. Write NULL into *p_unicode
2547 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002548static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002549unicode_adjust_maxchar(PyObject **p_unicode)
2550{
2551 PyObject *unicode, *copy;
2552 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002553 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002554 unsigned int kind;
2555
2556 assert(p_unicode != NULL);
2557 unicode = *p_unicode;
2558 assert(PyUnicode_IS_READY(unicode));
2559 if (PyUnicode_IS_ASCII(unicode))
2560 return;
2561
2562 len = PyUnicode_GET_LENGTH(unicode);
2563 kind = PyUnicode_KIND(unicode);
2564 if (kind == PyUnicode_1BYTE_KIND) {
2565 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002566 max_char = ucs1lib_find_max_char(u, u + len);
2567 if (max_char >= 128)
2568 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002569 }
2570 else if (kind == PyUnicode_2BYTE_KIND) {
2571 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002572 max_char = ucs2lib_find_max_char(u, u + len);
2573 if (max_char >= 256)
2574 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002575 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002576 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002577 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002578 max_char = ucs4lib_find_max_char(u, u + len);
2579 if (max_char >= 0x10000)
2580 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002581 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002582 else
2583 Py_UNREACHABLE();
2584
Victor Stinner25a4b292011-10-06 12:31:55 +02002585 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002586 if (copy != NULL)
2587 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002588 Py_DECREF(unicode);
2589 *p_unicode = copy;
2590}
2591
Victor Stinner034f6cf2011-09-30 02:26:44 +02002592PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002593_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002594{
Victor Stinner87af4f22011-11-21 23:03:47 +01002595 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002596 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002597
Victor Stinner034f6cf2011-09-30 02:26:44 +02002598 if (!PyUnicode_Check(unicode)) {
2599 PyErr_BadInternalCall();
2600 return NULL;
2601 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002602 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002603 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002604
Victor Stinner87af4f22011-11-21 23:03:47 +01002605 length = PyUnicode_GET_LENGTH(unicode);
2606 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002607 if (!copy)
2608 return NULL;
2609 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2610
Christian Heimesf051e432016-09-13 20:22:02 +02002611 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002612 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002613 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002614 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002615}
2616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002617
Victor Stinnerbc603d12011-10-02 01:00:40 +02002618/* Widen Unicode objects to larger buffers. Don't write terminating null
2619 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002620
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002621static void*
2622unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002623{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002624 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002625
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002626 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002627 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002628 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002629 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002630 if (!result)
2631 return PyErr_NoMemory();
2632 assert(skind == PyUnicode_1BYTE_KIND);
2633 _PyUnicode_CONVERT_BYTES(
2634 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002635 (const Py_UCS1 *)data,
2636 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002637 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002638 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002639 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002640 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002641 if (!result)
2642 return PyErr_NoMemory();
2643 if (skind == PyUnicode_2BYTE_KIND) {
2644 _PyUnicode_CONVERT_BYTES(
2645 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002646 (const Py_UCS2 *)data,
2647 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002648 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002649 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002650 else {
2651 assert(skind == PyUnicode_1BYTE_KIND);
2652 _PyUnicode_CONVERT_BYTES(
2653 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002654 (const Py_UCS1 *)data,
2655 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002656 result);
2657 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002658 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002659 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002660 Py_UNREACHABLE();
2661 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002662 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663}
2664
2665static Py_UCS4*
2666as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2667 int copy_null)
2668{
2669 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002670 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002671 Py_ssize_t len, targetlen;
2672 if (PyUnicode_READY(string) == -1)
2673 return NULL;
2674 kind = PyUnicode_KIND(string);
2675 data = PyUnicode_DATA(string);
2676 len = PyUnicode_GET_LENGTH(string);
2677 targetlen = len;
2678 if (copy_null)
2679 targetlen++;
2680 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002681 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682 if (!target) {
2683 PyErr_NoMemory();
2684 return NULL;
2685 }
2686 }
2687 else {
2688 if (targetsize < targetlen) {
2689 PyErr_Format(PyExc_SystemError,
2690 "string is longer than the buffer");
2691 if (copy_null && 0 < targetsize)
2692 target[0] = 0;
2693 return NULL;
2694 }
2695 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002696 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002697 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002698 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002699 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002700 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002701 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002702 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2703 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002704 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002705 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002706 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002707 else {
2708 Py_UNREACHABLE();
2709 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002710 if (copy_null)
2711 target[len] = 0;
2712 return target;
2713}
2714
2715Py_UCS4*
2716PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2717 int copy_null)
2718{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002719 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002720 PyErr_BadInternalCall();
2721 return NULL;
2722 }
2723 return as_ucs4(string, target, targetsize, copy_null);
2724}
2725
2726Py_UCS4*
2727PyUnicode_AsUCS4Copy(PyObject *string)
2728{
2729 return as_ucs4(string, NULL, 0, 1);
2730}
2731
/* Size of the scratch buffers that receive the sprintf() output of an
   integer conversion (up to %lld) or of %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign; 53/22 serves as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002736
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002737static int
2738unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2739 Py_ssize_t width, Py_ssize_t precision)
2740{
2741 Py_ssize_t length, fill, arglen;
2742 Py_UCS4 maxchar;
2743
2744 if (PyUnicode_READY(str) == -1)
2745 return -1;
2746
2747 length = PyUnicode_GET_LENGTH(str);
2748 if ((precision == -1 || precision >= length)
2749 && width <= length)
2750 return _PyUnicodeWriter_WriteStr(writer, str);
2751
2752 if (precision != -1)
2753 length = Py_MIN(precision, length);
2754
2755 arglen = Py_MAX(length, width);
2756 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2757 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2758 else
2759 maxchar = writer->maxchar;
2760
2761 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2762 return -1;
2763
2764 if (width > length) {
2765 fill = width - length;
2766 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2767 return -1;
2768 writer->pos += fill;
2769 }
2770
2771 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2772 str, 0, length);
2773 writer->pos += length;
2774 return 0;
2775}
2776
2777static int
Victor Stinner998b8062018-09-12 00:23:25 +02002778unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002779 Py_ssize_t width, Py_ssize_t precision)
2780{
2781 /* UTF-8 */
2782 Py_ssize_t length;
2783 PyObject *unicode;
2784 int res;
2785
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002786 if (precision == -1) {
2787 length = strlen(str);
2788 }
2789 else {
2790 length = 0;
2791 while (length < precision && str[length]) {
2792 length++;
2793 }
2794 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002795 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2796 if (unicode == NULL)
2797 return -1;
2798
2799 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2800 Py_DECREF(unicode);
2801 return res;
2802}
2803
Victor Stinner96865452011-03-01 23:44:09 +00002804static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002805unicode_fromformat_arg(_PyUnicodeWriter *writer,
2806 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002807{
Victor Stinnere215d962012-10-06 23:03:36 +02002808 const char *p;
2809 Py_ssize_t len;
2810 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002811 Py_ssize_t width;
2812 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002813 int longflag;
2814 int longlongflag;
2815 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002816 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002817
2818 p = f;
2819 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002820 zeropad = 0;
2821 if (*f == '0') {
2822 zeropad = 1;
2823 f++;
2824 }
Victor Stinner96865452011-03-01 23:44:09 +00002825
2826 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002827 width = -1;
2828 if (Py_ISDIGIT((unsigned)*f)) {
2829 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002830 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002831 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002832 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002833 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002834 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002835 return NULL;
2836 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002837 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002838 f++;
2839 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002840 }
2841 precision = -1;
2842 if (*f == '.') {
2843 f++;
2844 if (Py_ISDIGIT((unsigned)*f)) {
2845 precision = (*f - '0');
2846 f++;
2847 while (Py_ISDIGIT((unsigned)*f)) {
2848 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2849 PyErr_SetString(PyExc_ValueError,
2850 "precision too big");
2851 return NULL;
2852 }
2853 precision = (precision * 10) + (*f - '0');
2854 f++;
2855 }
2856 }
Victor Stinner96865452011-03-01 23:44:09 +00002857 if (*f == '%') {
2858 /* "%.3%s" => f points to "3" */
2859 f--;
2860 }
2861 }
2862 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002863 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002864 f--;
2865 }
Victor Stinner96865452011-03-01 23:44:09 +00002866
2867 /* Handle %ld, %lu, %lld and %llu. */
2868 longflag = 0;
2869 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002870 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002871 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002872 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002873 longflag = 1;
2874 ++f;
2875 }
Victor Stinner96865452011-03-01 23:44:09 +00002876 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002877 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002878 longlongflag = 1;
2879 f += 2;
2880 }
Victor Stinner96865452011-03-01 23:44:09 +00002881 }
2882 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002883 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002884 size_tflag = 1;
2885 ++f;
2886 }
Victor Stinnere215d962012-10-06 23:03:36 +02002887
2888 if (f[1] == '\0')
2889 writer->overallocate = 0;
2890
2891 switch (*f) {
2892 case 'c':
2893 {
2894 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002895 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002896 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002897 "character argument not in range(0x110000)");
2898 return NULL;
2899 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002900 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002901 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002902 break;
2903 }
2904
2905 case 'i':
2906 case 'd':
2907 case 'u':
2908 case 'x':
2909 {
2910 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002911 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002912 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002913
2914 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002915 if (longflag) {
2916 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2917 }
2918 else if (longlongflag) {
2919 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2920 }
2921 else if (size_tflag) {
2922 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2923 }
2924 else {
2925 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2926 }
Victor Stinnere215d962012-10-06 23:03:36 +02002927 }
2928 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002929 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002930 }
2931 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002932 if (longflag) {
2933 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2934 }
2935 else if (longlongflag) {
2936 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2937 }
2938 else if (size_tflag) {
2939 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2940 }
2941 else {
2942 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2943 }
Victor Stinnere215d962012-10-06 23:03:36 +02002944 }
2945 assert(len >= 0);
2946
Victor Stinnere215d962012-10-06 23:03:36 +02002947 if (precision < len)
2948 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002949
2950 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002951 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2952 return NULL;
2953
Victor Stinnere215d962012-10-06 23:03:36 +02002954 if (width > precision) {
2955 Py_UCS4 fillchar;
2956 fill = width - precision;
2957 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002958 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2959 return NULL;
2960 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002961 }
Victor Stinner15a11362012-10-06 23:48:20 +02002962 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002963 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002964 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2965 return NULL;
2966 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002967 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002968
Victor Stinner4a587072013-11-19 12:54:53 +01002969 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2970 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002971 break;
2972 }
2973
2974 case 'p':
2975 {
2976 char number[MAX_LONG_LONG_CHARS];
2977
2978 len = sprintf(number, "%p", va_arg(*vargs, void*));
2979 assert(len >= 0);
2980
2981 /* %p is ill-defined: ensure leading 0x. */
2982 if (number[1] == 'X')
2983 number[1] = 'x';
2984 else if (number[1] != 'x') {
2985 memmove(number + 2, number,
2986 strlen(number) + 1);
2987 number[0] = '0';
2988 number[1] = 'x';
2989 len += 2;
2990 }
2991
Victor Stinner4a587072013-11-19 12:54:53 +01002992 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002993 return NULL;
2994 break;
2995 }
2996
2997 case 's':
2998 {
2999 /* UTF-8 */
3000 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02003001 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003002 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003003 break;
3004 }
3005
3006 case 'U':
3007 {
3008 PyObject *obj = va_arg(*vargs, PyObject *);
3009 assert(obj && _PyUnicode_CHECK(obj));
3010
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003011 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003012 return NULL;
3013 break;
3014 }
3015
3016 case 'V':
3017 {
3018 PyObject *obj = va_arg(*vargs, PyObject *);
3019 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02003020 if (obj) {
3021 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003022 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003023 return NULL;
3024 }
3025 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003026 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02003027 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003028 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003029 }
3030 break;
3031 }
3032
3033 case 'S':
3034 {
3035 PyObject *obj = va_arg(*vargs, PyObject *);
3036 PyObject *str;
3037 assert(obj);
3038 str = PyObject_Str(obj);
3039 if (!str)
3040 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003041 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003042 Py_DECREF(str);
3043 return NULL;
3044 }
3045 Py_DECREF(str);
3046 break;
3047 }
3048
3049 case 'R':
3050 {
3051 PyObject *obj = va_arg(*vargs, PyObject *);
3052 PyObject *repr;
3053 assert(obj);
3054 repr = PyObject_Repr(obj);
3055 if (!repr)
3056 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003057 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003058 Py_DECREF(repr);
3059 return NULL;
3060 }
3061 Py_DECREF(repr);
3062 break;
3063 }
3064
3065 case 'A':
3066 {
3067 PyObject *obj = va_arg(*vargs, PyObject *);
3068 PyObject *ascii;
3069 assert(obj);
3070 ascii = PyObject_ASCII(obj);
3071 if (!ascii)
3072 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003073 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003074 Py_DECREF(ascii);
3075 return NULL;
3076 }
3077 Py_DECREF(ascii);
3078 break;
3079 }
3080
3081 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003082 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003083 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003084 break;
3085
3086 default:
3087 /* if we stumble upon an unknown formatting code, copy the rest
3088 of the format string to the output string. (we cannot just
3089 skip the code, since there's no way to know what's in the
3090 argument list) */
3091 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003092 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003093 return NULL;
3094 f = p+len;
3095 return f;
3096 }
3097
3098 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003099 return f;
3100}
3101
Walter Dörwaldd2034312007-05-18 16:29:38 +00003102PyObject *
3103PyUnicode_FromFormatV(const char *format, va_list vargs)
3104{
Victor Stinnere215d962012-10-06 23:03:36 +02003105 va_list vargs2;
3106 const char *f;
3107 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003108
Victor Stinner8f674cc2013-04-17 23:02:17 +02003109 _PyUnicodeWriter_Init(&writer);
3110 writer.min_length = strlen(format) + 100;
3111 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003112
Benjamin Peterson0c212142016-09-20 20:39:33 -07003113 // Copy varags to be able to pass a reference to a subfunction.
3114 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003115
3116 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003117 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003118 f = unicode_fromformat_arg(&writer, f, &vargs2);
3119 if (f == NULL)
3120 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003121 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003122 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003123 const char *p;
3124 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003125
Victor Stinnere215d962012-10-06 23:03:36 +02003126 p = f;
3127 do
3128 {
3129 if ((unsigned char)*p > 127) {
3130 PyErr_Format(PyExc_ValueError,
3131 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3132 "string, got a non-ASCII byte: 0x%02x",
3133 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003134 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003135 }
3136 p++;
3137 }
3138 while (*p != '\0' && *p != '%');
3139 len = p - f;
3140
3141 if (*p == '\0')
3142 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003143
3144 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003145 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003146
3147 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003148 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003149 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003150 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003151 return _PyUnicodeWriter_Finish(&writer);
3152
3153 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003154 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003155 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003156 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003157}
3158
Walter Dörwaldd2034312007-05-18 16:29:38 +00003159PyObject *
3160PyUnicode_FromFormat(const char *format, ...)
3161{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003162 PyObject* ret;
3163 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003164
3165#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003166 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003167#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003168 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003169#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003170 ret = PyUnicode_FromFormatV(format, vargs);
3171 va_end(vargs);
3172 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003173}
3174
Serhiy Storchakac46db922018-10-23 22:58:24 +03003175static Py_ssize_t
3176unicode_get_widechar_size(PyObject *unicode)
3177{
3178 Py_ssize_t res;
3179
3180 assert(unicode != NULL);
3181 assert(_PyUnicode_CHECK(unicode));
3182
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003183#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchakac46db922018-10-23 22:58:24 +03003184 if (_PyUnicode_WSTR(unicode) != NULL) {
3185 return PyUnicode_WSTR_LENGTH(unicode);
3186 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003187#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003188 assert(PyUnicode_IS_READY(unicode));
3189
3190 res = _PyUnicode_LENGTH(unicode);
3191#if SIZEOF_WCHAR_T == 2
3192 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3193 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3194 const Py_UCS4 *end = s + res;
3195 for (; s < end; ++s) {
3196 if (*s > 0xFFFF) {
3197 ++res;
3198 }
3199 }
3200 }
3201#endif
3202 return res;
3203}
3204
3205static void
3206unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3207{
Serhiy Storchakac46db922018-10-23 22:58:24 +03003208 assert(unicode != NULL);
3209 assert(_PyUnicode_CHECK(unicode));
3210
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003211#if USE_UNICODE_WCHAR_CACHE
3212 const wchar_t *wstr = _PyUnicode_WSTR(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003213 if (wstr != NULL) {
3214 memcpy(w, wstr, size * sizeof(wchar_t));
3215 return;
3216 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003217#else /* USE_UNICODE_WCHAR_CACHE */
3218 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3219 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3220 return;
3221 }
3222#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003223 assert(PyUnicode_IS_READY(unicode));
3224
3225 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3226 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3227 for (; size--; ++s, ++w) {
3228 *w = *s;
3229 }
3230 }
3231 else {
3232#if SIZEOF_WCHAR_T == 4
3233 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3234 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3235 for (; size--; ++s, ++w) {
3236 *w = *s;
3237 }
3238#else
3239 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3240 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3241 for (; size--; ++s, ++w) {
3242 Py_UCS4 ch = *s;
3243 if (ch > 0xFFFF) {
3244 assert(ch <= MAX_UNICODE);
3245 /* encode surrogate pair in this case */
3246 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3247 if (!size--)
3248 break;
3249 *w = Py_UNICODE_LOW_SURROGATE(ch);
3250 }
3251 else {
3252 *w = ch;
3253 }
3254 }
3255#endif
3256 }
3257}
3258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003259#ifdef HAVE_WCHAR_H
3260
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003261/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003262
Victor Stinnerd88d9832011-09-06 02:00:05 +02003263 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003264 character) required to convert the unicode object. Ignore size argument.
3265
Victor Stinnerd88d9832011-09-06 02:00:05 +02003266 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003267 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003268 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003269Py_ssize_t
3270PyUnicode_AsWideChar(PyObject *unicode,
3271 wchar_t *w,
3272 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003273{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003274 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003275
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003276 if (unicode == NULL) {
3277 PyErr_BadInternalCall();
3278 return -1;
3279 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003280 if (!PyUnicode_Check(unicode)) {
3281 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003282 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003283 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003284
3285 res = unicode_get_widechar_size(unicode);
3286 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003287 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003288 }
3289
3290 if (size > res) {
3291 size = res + 1;
3292 }
3293 else {
3294 res = size;
3295 }
3296 unicode_copy_as_widechar(unicode, w, size);
3297 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003298}
3299
Victor Stinner137c34c2010-09-29 10:25:54 +00003300wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003301PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003302 Py_ssize_t *size)
3303{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003304 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003305 Py_ssize_t buflen;
3306
3307 if (unicode == NULL) {
3308 PyErr_BadInternalCall();
3309 return NULL;
3310 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003311 if (!PyUnicode_Check(unicode)) {
3312 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003313 return NULL;
3314 }
3315
Serhiy Storchakac46db922018-10-23 22:58:24 +03003316 buflen = unicode_get_widechar_size(unicode);
3317 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003318 if (buffer == NULL) {
3319 PyErr_NoMemory();
3320 return NULL;
3321 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003322 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3323 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003324 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003325 }
3326 else if (wcslen(buffer) != (size_t)buflen) {
Victor Stinner00d7abd2020-12-01 09:56:42 +01003327 PyMem_Free(buffer);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003328 PyErr_SetString(PyExc_ValueError,
3329 "embedded null character");
3330 return NULL;
3331 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003332 return buffer;
3333}
3334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003335#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003337int
3338_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3339{
3340 wchar_t **p = (wchar_t **)ptr;
3341 if (obj == NULL) {
3342#if !USE_UNICODE_WCHAR_CACHE
3343 PyMem_Free(*p);
3344#endif /* USE_UNICODE_WCHAR_CACHE */
3345 *p = NULL;
3346 return 1;
3347 }
3348 if (PyUnicode_Check(obj)) {
3349#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003350 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3351 if (*p == NULL) {
3352 return 0;
3353 }
3354 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003355#else /* USE_UNICODE_WCHAR_CACHE */
3356 *p = PyUnicode_AsWideCharString(obj, NULL);
3357 if (*p == NULL) {
3358 return 0;
3359 }
3360 return Py_CLEANUP_SUPPORTED;
3361#endif /* USE_UNICODE_WCHAR_CACHE */
3362 }
3363 PyErr_Format(PyExc_TypeError,
3364 "argument must be str, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003365 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003366 return 0;
3367}
3368
3369int
3370_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3371{
3372 wchar_t **p = (wchar_t **)ptr;
3373 if (obj == NULL) {
3374#if !USE_UNICODE_WCHAR_CACHE
3375 PyMem_Free(*p);
3376#endif /* USE_UNICODE_WCHAR_CACHE */
3377 *p = NULL;
3378 return 1;
3379 }
3380 if (obj == Py_None) {
3381 *p = NULL;
3382 return 1;
3383 }
3384 if (PyUnicode_Check(obj)) {
3385#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003386 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3387 if (*p == NULL) {
3388 return 0;
3389 }
3390 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003391#else /* USE_UNICODE_WCHAR_CACHE */
3392 *p = PyUnicode_AsWideCharString(obj, NULL);
3393 if (*p == NULL) {
3394 return 0;
3395 }
3396 return Py_CLEANUP_SUPPORTED;
3397#endif /* USE_UNICODE_WCHAR_CACHE */
3398 }
3399 PyErr_Format(PyExc_TypeError,
3400 "argument must be str or None, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003401 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003402 return 0;
3403}
3404
Alexander Belopolsky40018472011-02-26 01:02:56 +00003405PyObject *
3406PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003407{
Victor Stinner8faf8212011-12-08 22:14:11 +01003408 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003409 PyErr_SetString(PyExc_ValueError,
3410 "chr() arg not in range(0x110000)");
3411 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003412 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003413
Victor Stinner985a82a2014-01-03 12:53:47 +01003414 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003415}
3416
Alexander Belopolsky40018472011-02-26 01:02:56 +00003417PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003418PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003419{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003420 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003421 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003422 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003423 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003424 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003425 Py_INCREF(obj);
3426 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003427 }
3428 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003429 /* For a Unicode subtype that's not a Unicode object,
3430 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003431 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003432 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003433 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003434 "Can't convert '%.100s' object to str implicitly",
3435 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003436 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003437}
3438
Alexander Belopolsky40018472011-02-26 01:02:56 +00003439PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003440PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003441 const char *encoding,
3442 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003443{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003444 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003445 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003446
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003448 PyErr_BadInternalCall();
3449 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003450 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003451
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003452 /* Decoding bytes objects is the most common case and should be fast */
3453 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003454 if (PyBytes_GET_SIZE(obj) == 0) {
3455 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3456 return NULL;
3457 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003458 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003459 }
3460 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003461 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3462 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003463 }
3464
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003465 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003466 PyErr_SetString(PyExc_TypeError,
3467 "decoding str is not supported");
3468 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003469 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003470
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003471 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3472 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3473 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003474 "decoding to str: need a bytes-like object, %.80s found",
3475 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003476 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003477 }
Tim Petersced69f82003-09-16 20:30:58 +00003478
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003479 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003480 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003481 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3482 return NULL;
3483 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003484 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003485 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003486
Serhiy Storchaka05997252013-01-26 12:14:02 +02003487 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003488 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003489 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490}
3491
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied (lowercased) into 'lower'.
   Every run of other characters located between two copied characters is
   replaced with a single '_'; such runs at the very beginning or end of the
   name are dropped.  The result is always NUL-terminated on success.

   encoding:  NUL-terminated input name, must not be NULL.
   lower:     output buffer.
   lower_len: size of 'lower' in bytes, including room for the NUL.

   Return 1 on success, or 0 on error (the normalized name is longer than
   lower_len-1, or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out;
    char *out_end;
    int pending_sep;

    assert(encoding != NULL);

    /* A zero-sized buffer cannot even hold the terminating NUL.  Bail out
       before computing lower_len - 1, which would wrap around and produce
       an out-of-bounds end pointer. */
    if (lower_len == 0) {
        return 0;
    }

    out = lower;
    out_end = &lower[lower_len - 1];    /* last slot, reserved for '\0' */
    pending_sep = 0;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted if another
               copied character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3540
Alexander Belopolsky40018472011-02-26 01:02:56 +00003541PyObject *
3542PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003543 Py_ssize_t size,
3544 const char *encoding,
3545 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003546{
3547 PyObject *buffer = NULL, *unicode;
3548 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003549 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3550
Victor Stinner22eb6892019-06-26 00:51:05 +02003551 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3552 return NULL;
3553 }
3554
Victor Stinnered076ed2019-06-26 01:49:32 +02003555 if (size == 0) {
3556 _Py_RETURN_UNICODE_EMPTY();
3557 }
3558
Victor Stinner942889a2016-09-05 15:40:10 -07003559 if (encoding == NULL) {
3560 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3561 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003562
Fred Drakee4315f52000-05-09 19:53:39 +00003563 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003564 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3565 char *lower = buflower;
3566
3567 /* Fast paths */
3568 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3569 lower += 3;
3570 if (*lower == '_') {
3571 /* Match "utf8" and "utf_8" */
3572 lower++;
3573 }
3574
3575 if (lower[0] == '8' && lower[1] == 0) {
3576 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3577 }
3578 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3579 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3580 }
3581 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3582 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3583 }
3584 }
3585 else {
3586 if (strcmp(lower, "ascii") == 0
3587 || strcmp(lower, "us_ascii") == 0) {
3588 return PyUnicode_DecodeASCII(s, size, errors);
3589 }
Steve Dowercc16be82016-09-08 10:35:16 -07003590 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003591 else if (strcmp(lower, "mbcs") == 0) {
3592 return PyUnicode_DecodeMBCS(s, size, errors);
3593 }
3594 #endif
3595 else if (strcmp(lower, "latin1") == 0
3596 || strcmp(lower, "latin_1") == 0
3597 || strcmp(lower, "iso_8859_1") == 0
3598 || strcmp(lower, "iso8859_1") == 0) {
3599 return PyUnicode_DecodeLatin1(s, size, errors);
3600 }
3601 }
Victor Stinner37296e82010-06-10 13:36:23 +00003602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603
3604 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003605 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003606 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003607 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003608 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609 if (buffer == NULL)
3610 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003611 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612 if (unicode == NULL)
3613 goto onError;
3614 if (!PyUnicode_Check(unicode)) {
3615 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003616 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003617 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003618 encoding,
3619 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620 Py_DECREF(unicode);
3621 goto onError;
3622 }
3623 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003624 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003625
Benjamin Peterson29060642009-01-31 22:14:21 +00003626 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003627 Py_XDECREF(buffer);
3628 return NULL;
3629}
3630
Alexander Belopolsky40018472011-02-26 01:02:56 +00003631PyObject *
3632PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003633 const char *encoding,
3634 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003635{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003636 if (!PyUnicode_Check(unicode)) {
3637 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003638 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003639 }
3640
Serhiy Storchaka00939072016-10-27 21:05:49 +03003641 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3642 "PyUnicode_AsDecodedObject() is deprecated; "
3643 "use PyCodec_Decode() to decode from str", 1) < 0)
3644 return NULL;
3645
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003646 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003647 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003648
3649 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003650 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003651}
3652
Alexander Belopolsky40018472011-02-26 01:02:56 +00003653PyObject *
3654PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003655 const char *encoding,
3656 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003657{
3658 PyObject *v;
3659
3660 if (!PyUnicode_Check(unicode)) {
3661 PyErr_BadArgument();
3662 goto onError;
3663 }
3664
Serhiy Storchaka00939072016-10-27 21:05:49 +03003665 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3666 "PyUnicode_AsDecodedUnicode() is deprecated; "
3667 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3668 return NULL;
3669
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003670 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003671 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003672
3673 /* Decode via the codec registry */
3674 v = PyCodec_Decode(unicode, encoding, errors);
3675 if (v == NULL)
3676 goto onError;
3677 if (!PyUnicode_Check(v)) {
3678 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003679 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003680 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003681 encoding,
3682 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003683 Py_DECREF(v);
3684 goto onError;
3685 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003686 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003687
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003689 return NULL;
3690}
3691
Alexander Belopolsky40018472011-02-26 01:02:56 +00003692PyObject *
3693PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003694 Py_ssize_t size,
3695 const char *encoding,
3696 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003697{
3698 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003699
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003700 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003702 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3704 Py_DECREF(unicode);
3705 return v;
3706}
3707
Alexander Belopolsky40018472011-02-26 01:02:56 +00003708PyObject *
3709PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003710 const char *encoding,
3711 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003712{
3713 PyObject *v;
3714
3715 if (!PyUnicode_Check(unicode)) {
3716 PyErr_BadArgument();
3717 goto onError;
3718 }
3719
Serhiy Storchaka00939072016-10-27 21:05:49 +03003720 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3721 "PyUnicode_AsEncodedObject() is deprecated; "
3722 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3723 "or PyCodec_Encode() for generic encoding", 1) < 0)
3724 return NULL;
3725
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003726 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003727 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003728
3729 /* Encode via the codec registry */
3730 v = PyCodec_Encode(unicode, encoding, errors);
3731 if (v == NULL)
3732 goto onError;
3733 return v;
3734
Benjamin Peterson29060642009-01-31 22:14:21 +00003735 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003736 return NULL;
3737}
3738
Victor Stinner1b579672011-12-17 05:47:23 +01003739
Victor Stinner2cba6b82018-01-10 22:46:15 +01003740static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003741unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003742 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003743{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003744 Py_ssize_t wlen;
3745 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3746 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003747 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003748 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003749
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003750 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003751 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003752 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003753 return NULL;
3754 }
3755
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003756 char *str;
3757 size_t error_pos;
3758 const char *reason;
3759 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003760 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003761 PyMem_Free(wstr);
3762
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003763 if (res != 0) {
3764 if (res == -2) {
3765 PyObject *exc;
3766 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3767 "locale", unicode,
3768 (Py_ssize_t)error_pos,
3769 (Py_ssize_t)(error_pos+1),
3770 reason);
3771 if (exc != NULL) {
3772 PyCodec_StrictErrors(exc);
3773 Py_DECREF(exc);
3774 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003775 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003776 else if (res == -3) {
3777 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3778 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003779 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003780 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003781 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003782 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003783 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003784
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003785 PyObject *bytes = PyBytes_FromString(str);
3786 PyMem_RawFree(str);
3787 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003788}
3789
Victor Stinnerad158722010-10-27 00:25:46 +00003790PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003791PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3792{
Victor Stinner709d23d2019-05-02 14:56:30 -04003793 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3794 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003795}
3796
3797PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003798PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003799{
Victor Stinner81a7be32020-04-14 15:14:01 +02003800 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003801 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3802 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003803 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003804 fs_codec->error_handler,
3805 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003806 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003807#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003808 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003809 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003810 fs_codec->encoding,
3811 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003812 }
Victor Stinnerad158722010-10-27 00:25:46 +00003813#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003814 else {
3815 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3816 machinery is not ready and so cannot be used:
3817 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003818 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3819 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003820 assert(filesystem_errors != NULL);
3821 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3822 assert(errors != _Py_ERROR_UNKNOWN);
3823#ifdef _Py_FORCE_UTF8_FS_ENCODING
3824 return unicode_encode_utf8(unicode, errors, NULL);
3825#else
3826 return unicode_encode_locale(unicode, errors, 0);
3827#endif
3828 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003829}
3830
Alexander Belopolsky40018472011-02-26 01:02:56 +00003831PyObject *
3832PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003833 const char *encoding,
3834 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003835{
3836 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003837 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003838
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 if (!PyUnicode_Check(unicode)) {
3840 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003841 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842 }
Fred Drakee4315f52000-05-09 19:53:39 +00003843
Victor Stinner22eb6892019-06-26 00:51:05 +02003844 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3845 return NULL;
3846 }
3847
Victor Stinner942889a2016-09-05 15:40:10 -07003848 if (encoding == NULL) {
3849 return _PyUnicode_AsUTF8String(unicode, errors);
3850 }
3851
Fred Drakee4315f52000-05-09 19:53:39 +00003852 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003853 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3854 char *lower = buflower;
3855
3856 /* Fast paths */
3857 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3858 lower += 3;
3859 if (*lower == '_') {
3860 /* Match "utf8" and "utf_8" */
3861 lower++;
3862 }
3863
3864 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003865 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003866 }
3867 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3868 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3869 }
3870 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3871 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3872 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003873 }
Victor Stinner942889a2016-09-05 15:40:10 -07003874 else {
3875 if (strcmp(lower, "ascii") == 0
3876 || strcmp(lower, "us_ascii") == 0) {
3877 return _PyUnicode_AsASCIIString(unicode, errors);
3878 }
Steve Dowercc16be82016-09-08 10:35:16 -07003879#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003880 else if (strcmp(lower, "mbcs") == 0) {
3881 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3882 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003883#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003884 else if (strcmp(lower, "latin1") == 0 ||
3885 strcmp(lower, "latin_1") == 0 ||
3886 strcmp(lower, "iso_8859_1") == 0 ||
3887 strcmp(lower, "iso8859_1") == 0) {
3888 return _PyUnicode_AsLatin1String(unicode, errors);
3889 }
3890 }
Victor Stinner37296e82010-06-10 13:36:23 +00003891 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892
3893 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003894 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003896 return NULL;
3897
3898 /* The normal path */
3899 if (PyBytes_Check(v))
3900 return v;
3901
3902 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003903 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003904 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003905 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003906
3907 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003908 "encoder %s returned bytearray instead of bytes; "
3909 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003910 encoding);
3911 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003912 Py_DECREF(v);
3913 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003914 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003915
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003916 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3917 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003918 Py_DECREF(v);
3919 return b;
3920 }
3921
3922 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003923 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003924 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003925 encoding,
3926 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003927 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003928 return NULL;
3929}
3930
Alexander Belopolsky40018472011-02-26 01:02:56 +00003931PyObject *
3932PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003933 const char *encoding,
3934 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003935{
3936 PyObject *v;
3937
3938 if (!PyUnicode_Check(unicode)) {
3939 PyErr_BadArgument();
3940 goto onError;
3941 }
3942
Serhiy Storchaka00939072016-10-27 21:05:49 +03003943 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3944 "PyUnicode_AsEncodedUnicode() is deprecated; "
3945 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3946 return NULL;
3947
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003948 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003949 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003950
3951 /* Encode via the codec registry */
3952 v = PyCodec_Encode(unicode, encoding, errors);
3953 if (v == NULL)
3954 goto onError;
3955 if (!PyUnicode_Check(v)) {
3956 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003957 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003958 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003959 encoding,
3960 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003961 Py_DECREF(v);
3962 goto onError;
3963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003965
Benjamin Peterson29060642009-01-31 22:14:21 +00003966 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967 return NULL;
3968}
3969
Victor Stinner2cba6b82018-01-10 22:46:15 +01003970static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003971unicode_decode_locale(const char *str, Py_ssize_t len,
3972 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003973{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003974 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3975 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003976 return NULL;
3977 }
3978
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003979 wchar_t *wstr;
3980 size_t wlen;
3981 const char *reason;
3982 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003983 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003984 if (res != 0) {
3985 if (res == -2) {
3986 PyObject *exc;
3987 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3988 "locale", str, len,
3989 (Py_ssize_t)wlen,
3990 (Py_ssize_t)(wlen + 1),
3991 reason);
3992 if (exc != NULL) {
3993 PyCodec_StrictErrors(exc);
3994 Py_DECREF(exc);
3995 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003996 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003997 else if (res == -3) {
3998 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3999 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01004000 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004001 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01004002 }
Victor Stinner2f197072011-12-17 07:08:30 +01004003 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01004004 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004005
4006 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4007 PyMem_RawFree(wstr);
4008 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004009}
4010
4011PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01004012PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4013 const char *errors)
4014{
Victor Stinner709d23d2019-05-02 14:56:30 -04004015 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4016 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01004017}
4018
4019PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01004020PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004021{
4022 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04004023 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4024 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004025}
4026
4027
4028PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00004029PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004030 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00004031 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4032}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004033
Christian Heimes5894ba72007-11-04 11:43:14 +00004034PyObject*
4035PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4036{
Victor Stinner81a7be32020-04-14 15:14:01 +02004037 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02004038 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4039 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04004040 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004041 fs_codec->error_handler,
4042 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04004043 NULL);
4044 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004045#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02004046 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08004047 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004048 fs_codec->encoding,
4049 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004050 }
Victor Stinnerad158722010-10-27 00:25:46 +00004051#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004052 else {
4053 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4054 machinery is not ready and so cannot be used:
4055 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02004056 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4057 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004058 assert(filesystem_errors != NULL);
4059 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4060 assert(errors != _Py_ERROR_UNKNOWN);
4061#ifdef _Py_FORCE_UTF8_FS_ENCODING
4062 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4063#else
4064 return unicode_decode_locale(s, size, errors, 0);
4065#endif
4066 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004067}
4068
Martin v. Löwis011e8422009-05-05 04:43:17 +00004069
4070int
4071PyUnicode_FSConverter(PyObject* arg, void* addr)
4072{
Brett Cannonec6ce872016-09-06 15:50:29 -07004073 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004074 PyObject *output = NULL;
4075 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004076 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004077 if (arg == NULL) {
4078 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08004079 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004080 return 1;
4081 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004082 path = PyOS_FSPath(arg);
4083 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004084 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004085 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004086 if (PyBytes_Check(path)) {
4087 output = path;
4088 }
4089 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4090 output = PyUnicode_EncodeFSDefault(path);
4091 Py_DECREF(path);
4092 if (!output) {
4093 return 0;
4094 }
4095 assert(PyBytes_Check(output));
4096 }
4097
Victor Stinner0ea2a462010-04-30 00:22:08 +00004098 size = PyBytes_GET_SIZE(output);
4099 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02004100 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004101 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00004102 Py_DECREF(output);
4103 return 0;
4104 }
4105 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004106 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004107}
4108
4109
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004110int
4111PyUnicode_FSDecoder(PyObject* arg, void* addr)
4112{
Brett Cannona5711202016-09-06 19:36:01 -07004113 int is_buffer = 0;
4114 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004115 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004116 if (arg == NULL) {
4117 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004118 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004119 return 1;
4120 }
Brett Cannona5711202016-09-06 19:36:01 -07004121
4122 is_buffer = PyObject_CheckBuffer(arg);
4123 if (!is_buffer) {
4124 path = PyOS_FSPath(arg);
4125 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004126 return 0;
4127 }
Brett Cannona5711202016-09-06 19:36:01 -07004128 }
4129 else {
4130 path = arg;
4131 Py_INCREF(arg);
4132 }
4133
4134 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004135 output = path;
4136 }
4137 else if (PyBytes_Check(path) || is_buffer) {
4138 PyObject *path_bytes = NULL;
4139
4140 if (!PyBytes_Check(path) &&
4141 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004142 "path should be string, bytes, or os.PathLike, not %.200s",
4143 Py_TYPE(arg)->tp_name)) {
4144 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004145 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004146 }
4147 path_bytes = PyBytes_FromObject(path);
4148 Py_DECREF(path);
4149 if (!path_bytes) {
4150 return 0;
4151 }
4152 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4153 PyBytes_GET_SIZE(path_bytes));
4154 Py_DECREF(path_bytes);
4155 if (!output) {
4156 return 0;
4157 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004158 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004159 else {
4160 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004161 "path should be string, bytes, or os.PathLike, not %.200s",
4162 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004163 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004164 return 0;
4165 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004166 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004167 Py_DECREF(output);
4168 return 0;
4169 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004170 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004171 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004172 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004173 Py_DECREF(output);
4174 return 0;
4175 }
4176 *(PyObject**)addr = output;
4177 return Py_CLEANUP_SUPPORTED;
4178}
4179
4180
Inada Naoki02a4d572020-02-27 13:48:59 +09004181static int unicode_fill_utf8(PyObject *unicode);
4182
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004183const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004184PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004185{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004186 if (!PyUnicode_Check(unicode)) {
4187 PyErr_BadArgument();
4188 return NULL;
4189 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004190 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004191 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004192
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004193 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004194 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004195 return NULL;
4196 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004197 }
4198
4199 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004200 *psize = PyUnicode_UTF8_LENGTH(unicode);
4201 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004202}
4203
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004204const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004205PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004206{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004207 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4208}
4209
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004210Py_UNICODE *
4211PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4212{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004213 if (!PyUnicode_Check(unicode)) {
4214 PyErr_BadArgument();
4215 return NULL;
4216 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004217 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4218 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004219 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004220 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004221 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004222
Serhiy Storchakac46db922018-10-23 22:58:24 +03004223 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4224 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4225 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004226 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004227 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01004228 w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
Serhiy Storchakac46db922018-10-23 22:58:24 +03004229 if (w == NULL) {
4230 PyErr_NoMemory();
4231 return NULL;
4232 }
4233 unicode_copy_as_widechar(unicode, w, wlen + 1);
4234 _PyUnicode_WSTR(unicode) = w;
4235 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4236 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004237 }
4238 }
4239 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004240 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004241 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004242}
4243
Inada Naoki2c4928d2020-06-17 20:09:44 +09004244/* Deprecated APIs */
4245
4246_Py_COMP_DIAG_PUSH
4247_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4248
Alexander Belopolsky40018472011-02-26 01:02:56 +00004249Py_UNICODE *
4250PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004251{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004252 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253}
4254
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004255const Py_UNICODE *
4256_PyUnicode_AsUnicode(PyObject *unicode)
4257{
4258 Py_ssize_t size;
4259 const Py_UNICODE *wstr;
4260
4261 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4262 if (wstr && wcslen(wstr) != (size_t)size) {
4263 PyErr_SetString(PyExc_ValueError, "embedded null character");
4264 return NULL;
4265 }
4266 return wstr;
4267}
4268
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004269
Alexander Belopolsky40018472011-02-26 01:02:56 +00004270Py_ssize_t
4271PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272{
4273 if (!PyUnicode_Check(unicode)) {
4274 PyErr_BadArgument();
4275 goto onError;
4276 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004277 if (_PyUnicode_WSTR(unicode) == NULL) {
4278 if (PyUnicode_AsUnicode(unicode) == NULL)
4279 goto onError;
4280 }
4281 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004282
Benjamin Peterson29060642009-01-31 22:14:21 +00004283 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284 return -1;
4285}
4286
Inada Naoki2c4928d2020-06-17 20:09:44 +09004287_Py_COMP_DIAG_POP
4288
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004289Py_ssize_t
4290PyUnicode_GetLength(PyObject *unicode)
4291{
Victor Stinner07621332012-06-16 04:53:46 +02004292 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004293 PyErr_BadArgument();
4294 return -1;
4295 }
Victor Stinner07621332012-06-16 04:53:46 +02004296 if (PyUnicode_READY(unicode) == -1)
4297 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004298 return PyUnicode_GET_LENGTH(unicode);
4299}
4300
4301Py_UCS4
4302PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4303{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004304 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004305 int kind;
4306
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004307 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004308 PyErr_BadArgument();
4309 return (Py_UCS4)-1;
4310 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004311 if (PyUnicode_READY(unicode) == -1) {
4312 return (Py_UCS4)-1;
4313 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004314 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004315 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004316 return (Py_UCS4)-1;
4317 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004318 data = PyUnicode_DATA(unicode);
4319 kind = PyUnicode_KIND(unicode);
4320 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004321}
4322
4323int
4324PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4325{
4326 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004327 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004328 return -1;
4329 }
Victor Stinner488fa492011-12-12 00:01:39 +01004330 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004331 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004332 PyErr_SetString(PyExc_IndexError, "string index out of range");
4333 return -1;
4334 }
Victor Stinner488fa492011-12-12 00:01:39 +01004335 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004336 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004337 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4338 PyErr_SetString(PyExc_ValueError, "character out of range");
4339 return -1;
4340 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004341 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4342 index, ch);
4343 return 0;
4344}
4345
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4351
Victor Stinner554f3f02010-06-16 23:33:54 +00004352/* create or adjust a UnicodeDecodeError */
4353static void
4354make_decode_exception(PyObject **exceptionObject,
4355 const char *encoding,
4356 const char *input, Py_ssize_t length,
4357 Py_ssize_t startpos, Py_ssize_t endpos,
4358 const char *reason)
4359{
4360 if (*exceptionObject == NULL) {
4361 *exceptionObject = PyUnicodeDecodeError_Create(
4362 encoding, input, length, startpos, endpos, reason);
4363 }
4364 else {
4365 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4366 goto onError;
4367 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4368 goto onError;
4369 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4370 goto onError;
4371 }
4372 return;
4373
4374onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004375 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004376}
4377
Steve Dowercc16be82016-09-08 10:35:16 -07004378#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004379static int
4380widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4381{
4382 if (newsize > *size) {
4383 wchar_t *newbuf = *buf;
4384 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4385 PyErr_NoMemory();
4386 return -1;
4387 }
4388 *buf = newbuf;
4389 }
4390 *size = newsize;
4391 return 0;
4392}
4393
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   This variant writes the replacement into a wchar_t buffer:
   - *errorHandler is looked up from `errors` on first use and kept for
     later calls;
   - *exceptionObject is created or updated by make_decode_exception();
   - *input / *inend are re-read from the exception's object after the
     callback ran;
   - *buf / *bufsize are grown with widechar_resize() when needed, the
     replacement is copied at *buf + *outpos and *outpos is advanced;
   - *endinpos and *inptr are set to the position returned by the handler.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
{
    /* PyArg_ParseTuple() format; the text after "Un;" (offset 3) doubles as
       the TypeError message when the handler does not return a tuple. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    Py_ssize_t repwlen;

    /* Look the handler up only once; the caller keeps it in *errorHandler. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    /* repunicode is not DECREF'ed separately below; only restuple is
       released. */
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is taken relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* Determine the length of the replacement in wchar_t units. */
#if USE_UNICODE_WCHAR_CACHE
_Py_COMP_DIAG_PUSH
_Py_COMP_DIAG_IGNORE_DEPR_DECLS
    repwlen = PyUnicode_GetSize(repunicode);
    if (repwlen < 0)
        goto onError;
_Py_COMP_DIAG_POP
#else /* USE_UNICODE_WCHAR_CACHE */
    /* NOTE(review): presumably the NULL-buffer call reports the size
       including the terminating null, hence the decrement -- confirm
       against the PyUnicode_AsWideChar() documentation. */
    repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
    if (repwlen < 0)
        goto onError;
    repwlen--;
#endif /* USE_UNICODE_WCHAR_CACHE */
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    /* Each addition is checked against PY_SSIZE_T_MAX before it is done. */
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    outsize = *bufsize;
    if (requiredsize > outsize) {
        /* Grow to at least twice the current size, when doubling cannot
           overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
            goto onError;
        }
    }
    PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    /* falls through to the common cleanup below */

  onError:
    Py_XDECREF(restuple);
    return -1;
}
Steve Dowercc16be82016-09-08 10:35:16 -07004512#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004513
/* Decoding error handling callback helper, _PyUnicodeWriter variant.

   Looks up the error handler named by `errors` (kept in *errorHandler),
   creates or updates *exceptionObject, calls the handler and checks that
   it returned a (str, int) tuple.  The replacement string is then written
   to `writer`, *input / *inend are re-read from the exception's object,
   and *endinpos / *inptr are set to the position returned by the handler.

   Returns 0 on success, -1 on error. */
static int
unicode_decode_call_errorhandler_writer(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
{
    /* PyArg_ParseTuple() format; the text after "Un;" (offset 3) doubles as
       the TypeError message when the handler does not return a tuple. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t insize;
    Py_ssize_t newpos;
    Py_ssize_t replen;
    Py_ssize_t remain;
    PyObject *inputobj = NULL;
    int need_to_grow = 0;
    const char *new_inptr;

    /* Look the handler up only once; the caller keeps it in *errorHandler. */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    /* repunicode is not DECREF'ed separately below; only restuple is
       released. */
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    /* Number of input bytes after the error's end position, measured on
       the old input; must be computed before *input is overwritten. */
    remain = *inend - *input - *endinpos;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is taken relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    replen = PyUnicode_GET_LENGTH(repunicode);
    /* Only a replacement longer than one character raises min_length. */
    if (replen > 1) {
        writer->min_length += replen - 1;
        need_to_grow = 1;
    }
    new_inptr = *input + newpos;
    /* More input is left to decode than before the handler was called. */
    if (*inend - new_inptr > remain) {
        /* We don't know the decoding algorithm here so we make the worst
           assumption that one byte decodes to one unicode character.
           If unfortunately one byte could decode to more unicode characters,
           the decoder may write out-of-bound then. Is it possible for the
           algorithms using this function? */
        writer->min_length += *inend - new_inptr - remain;
        need_to_grow = 1;
    }
    if (need_to_grow) {
        writer->overallocate = 1;
        if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
            goto onError;
    }
    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
        goto onError;

    *endinpos = newpos;
    *inptr = new_inptr;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  onError:
    Py_XDECREF(restuple);
    return -1;
}
4613
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004614/* --- UTF-7 Codec -------------------------------------------------------- */
4615
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616/* See RFC2152 for details. We encode conservatively and decode liberally. */
4617
4618/* Three simple macros defining base-64. */
4619
/* Is c a base-64 character?
   True for 'A'-'Z', 'a'-'z', '0'-'9', '+' and '/'.
   Note: c is evaluated several times, so it must not have side effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')
4627
/* given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62.
   Every other value maps to 63, so the result is only meaningful when
   IS_BASE64(c) holds.
   Note: c is evaluated several times, so it must not have side effects. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)
4635
/* What is the base-64 character of the bottom 6 bits of n?
   n is masked with 0x3f and used as an index into the 64-character
   base-64 alphabet, so higher bits of n are ignored. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
4640
/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.
 * Note: c is evaluated twice, so it must not have side effects. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4648
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0-127) and is read-only.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : character code; only values 1..127 can be encoded directly
 *            (the range test runs before the table lookup, so larger
 *            values never index utf7_category).
 * directO  : non-zero if "Set O" characters are encoded as themselves.
 * directWS : non-zero if whitespace characters are encoded as themselves.
 *
 * The flag arguments are parenthesized in the expansion so that
 * expressions such as `a || b` can be passed safely.
 * Note: c is evaluated several times, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004694
Alexander Belopolsky40018472011-02-26 01:02:56 +00004695PyObject *
4696PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004697 Py_ssize_t size,
4698 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004699{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004700 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4701}
4702
Antoine Pitrou244651a2009-05-04 18:56:13 +00004703/* The decoder. The only state we preserve is our read position,
4704 * i.e. how many characters we have consumed. So if we end in the
4705 * middle of a shift sequence we have to back off the read position
4706 * and the output to the beginning of the sequence, otherwise we lose
4707 * all the shift state (seen bits, number of bits seen, high
4708 * surrogate). */
4709
Alexander Belopolsky40018472011-02-26 01:02:56 +00004710PyObject *
4711PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004712 Py_ssize_t size,
4713 const char *errors,
4714 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004715{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004716 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004717 Py_ssize_t startinpos;
4718 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004719 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004720 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004721 const char *errmsg = "";
4722 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004723 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004724 unsigned int base64bits = 0;
4725 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004726 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004727 PyObject *errorHandler = NULL;
4728 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004729
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004730 if (size == 0) {
4731 if (consumed)
4732 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004733 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004734 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004736 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004737 _PyUnicodeWriter_Init(&writer);
4738 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004739
4740 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004741 e = s + size;
4742
4743 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004744 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004745 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004746 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004747
Antoine Pitrou244651a2009-05-04 18:56:13 +00004748 if (inShift) { /* in a base-64 section */
4749 if (IS_BASE64(ch)) { /* consume a base-64 character */
4750 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4751 base64bits += 6;
4752 s++;
4753 if (base64bits >= 16) {
4754 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004755 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004756 base64bits -= 16;
4757 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004758 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004759 if (surrogate) {
4760 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004761 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4762 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004763 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004764 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004765 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004766 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004767 }
4768 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004769 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004770 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004771 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004772 }
4773 }
Victor Stinner551ac952011-11-29 22:58:13 +01004774 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004775 /* first surrogate */
4776 surrogate = outCh;
4777 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004778 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004779 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004780 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004781 }
4782 }
4783 }
4784 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004785 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004786 if (base64bits > 0) { /* left-over bits */
4787 if (base64bits >= 6) {
4788 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004789 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004790 errmsg = "partial character in shift sequence";
4791 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004792 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004793 else {
4794 /* Some bits remain; they should be zero */
4795 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004796 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004797 errmsg = "non-zero padding bits in shift sequence";
4798 goto utf7Error;
4799 }
4800 }
4801 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004802 if (surrogate && DECODE_DIRECT(ch)) {
4803 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4804 goto onError;
4805 }
4806 surrogate = 0;
4807 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004808 /* '-' is absorbed; other terminating
4809 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004810 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004811 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004812 }
4813 }
4814 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004815 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004816 s++; /* consume '+' */
4817 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004818 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004819 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004820 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004821 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004822 else if (s < e && !IS_BASE64(*s)) {
4823 s++;
4824 errmsg = "ill-formed sequence";
4825 goto utf7Error;
4826 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004827 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004828 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004829 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004830 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004831 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004832 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004833 }
4834 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004835 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004836 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004837 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004838 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004839 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004840 else {
4841 startinpos = s-starts;
4842 s++;
4843 errmsg = "unexpected special character";
4844 goto utf7Error;
4845 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004846 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004847utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004848 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004849 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004850 errors, &errorHandler,
4851 "utf7", errmsg,
4852 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004853 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004854 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004855 }
4856
Antoine Pitrou244651a2009-05-04 18:56:13 +00004857 /* end of string */
4858
4859 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4860 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004861 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004862 if (surrogate ||
4863 (base64bits >= 6) ||
4864 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004865 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004866 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004867 errors, &errorHandler,
4868 "utf7", "unterminated shift sequence",
4869 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004870 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004871 goto onError;
4872 if (s < e)
4873 goto restart;
4874 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004875 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004876
4877 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004878 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004879 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004880 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004881 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004882 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004883 writer.kind, writer.data, shiftOutStart);
4884 Py_XDECREF(errorHandler);
4885 Py_XDECREF(exc);
4886 _PyUnicodeWriter_Dealloc(&writer);
4887 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004888 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004889 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004890 }
4891 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004892 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004893 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004894 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004895
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004896 Py_XDECREF(errorHandler);
4897 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004898 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004899
Benjamin Peterson29060642009-01-31 22:14:21 +00004900 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004901 Py_XDECREF(errorHandler);
4902 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004903 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004904 return NULL;
4905}
4906
4907
Alexander Belopolsky40018472011-02-26 01:02:56 +00004908PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004909_PyUnicode_EncodeUTF7(PyObject *str,
4910 int base64SetO,
4911 int base64WhiteSpace,
4912 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004913{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004914 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004915 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004916 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004917 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004918 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004919 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004920 unsigned int base64bits = 0;
4921 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004922 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004923 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004924
Benjamin Petersonbac79492012-01-14 13:34:47 -05004925 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004926 return NULL;
4927 kind = PyUnicode_KIND(str);
4928 data = PyUnicode_DATA(str);
4929 len = PyUnicode_GET_LENGTH(str);
4930
4931 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004932 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004933
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004934 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004935 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004936 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004937 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004938 if (v == NULL)
4939 return NULL;
4940
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004941 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004942 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004943 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004944
Antoine Pitrou244651a2009-05-04 18:56:13 +00004945 if (inShift) {
4946 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4947 /* shifting out */
4948 if (base64bits) { /* output remaining bits */
4949 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4950 base64buffer = 0;
4951 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004952 }
4953 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004954 /* Characters not in the BASE64 set implicitly unshift the sequence
4955 so no '-' is required, except if the character is itself a '-' */
4956 if (IS_BASE64(ch) || ch == '-') {
4957 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004958 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004959 *out++ = (char) ch;
4960 }
4961 else {
4962 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004963 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004964 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004965 else { /* not in a shift sequence */
4966 if (ch == '+') {
4967 *out++ = '+';
4968 *out++ = '-';
4969 }
4970 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4971 *out++ = (char) ch;
4972 }
4973 else {
4974 *out++ = '+';
4975 inShift = 1;
4976 goto encode_char;
4977 }
4978 }
4979 continue;
4980encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004981 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004982 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004983
Antoine Pitrou244651a2009-05-04 18:56:13 +00004984 /* code first surrogate */
4985 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004986 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004987 while (base64bits >= 6) {
4988 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4989 base64bits -= 6;
4990 }
4991 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004992 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004993 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004994 base64bits += 16;
4995 base64buffer = (base64buffer << 16) | ch;
4996 while (base64bits >= 6) {
4997 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4998 base64bits -= 6;
4999 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00005000 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005001 if (base64bits)
5002 *out++= TO_BASE64(base64buffer << (6-base64bits) );
5003 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005004 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005005 if (_PyBytes_Resize(&v, out - start) < 0)
5006 return NULL;
5007 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005008}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005009PyObject *
5010PyUnicode_EncodeUTF7(const Py_UNICODE *s,
5011 Py_ssize_t size,
5012 int base64SetO,
5013 int base64WhiteSpace,
5014 const char *errors)
5015{
5016 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005017 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005018 if (tmp == NULL)
5019 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01005020 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005021 base64WhiteSpace, errors);
5022 Py_DECREF(tmp);
5023 return result;
5024}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005025
Antoine Pitrou244651a2009-05-04 18:56:13 +00005026#undef IS_BASE64
5027#undef FROM_BASE64
5028#undef TO_BASE64
5029#undef DECODE_DIRECT
5030#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005031
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032/* --- UTF-8 Codec -------------------------------------------------------- */
5033
Alexander Belopolsky40018472011-02-26 01:02:56 +00005034PyObject *
5035PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005036 Py_ssize_t size,
5037 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005038{
Walter Dörwald69652032004-09-07 20:24:22 +00005039 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5040}
5041
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005042#include "stringlib/asciilib.h"
5043#include "stringlib/codecs.h"
5044#include "stringlib/undef.h"
5045
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005046#include "stringlib/ucs1lib.h"
5047#include "stringlib/codecs.h"
5048#include "stringlib/undef.h"
5049
5050#include "stringlib/ucs2lib.h"
5051#include "stringlib/codecs.h"
5052#include "stringlib/undef.h"
5053
5054#include "stringlib/ucs4lib.h"
5055#include "stringlib/codecs.h"
5056#include "stringlib/undef.h"
5057
Ma Lina0c603c2020-10-18 22:48:38 +08005058/* Mask to quickly check whether a C 'size_t' contains a
Antoine Pitrouab868312009-01-10 15:40:25 +00005059 non-ASCII, UTF8-encoded char. */
Ma Lina0c603c2020-10-18 22:48:38 +08005060#if (SIZEOF_SIZE_T == 8)
5061# define ASCII_CHAR_MASK 0x8080808080808080ULL
5062#elif (SIZEOF_SIZE_T == 4)
5063# define ASCII_CHAR_MASK 0x80808080U
Antoine Pitrouab868312009-01-10 15:40:25 +00005064#else
Ma Lina0c603c2020-10-18 22:48:38 +08005065# error C 'size_t' size should be either 4 or 8!
Antoine Pitrouab868312009-01-10 15:40:25 +00005066#endif
5067
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005068static Py_ssize_t
5069ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005070{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005071 const char *p = start;
Ma Lina0c603c2020-10-18 22:48:38 +08005072 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_SIZE_T);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005073
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005074 /*
5075 * Issue #17237: m68k is a bit different from most architectures in
5076 * that objects do not use "natural alignment" - for example, int and
5077 * long are only aligned at 2-byte boundaries. Therefore the assert()
5078 * won't work; also, tests have shown that skipping the "optimised
5079 * version" will even speed up m68k.
5080 */
5081#if !defined(__m68k__)
Ma Lina0c603c2020-10-18 22:48:38 +08005082#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5083 assert(_Py_IS_ALIGNED(dest, SIZEOF_SIZE_T));
5084 if (_Py_IS_ALIGNED(p, SIZEOF_SIZE_T)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005085 /* Fast path, see in STRINGLIB(utf8_decode) for
5086 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005087 /* Help allocation */
5088 const char *_p = p;
5089 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005090 while (_p < aligned_end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005091 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005092 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00005093 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005094 *((size_t *)q) = value;
5095 _p += SIZEOF_SIZE_T;
5096 q += SIZEOF_SIZE_T;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005097 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005098 p = _p;
5099 while (p < end) {
5100 if ((unsigned char)*p & 0x80)
5101 break;
5102 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005104 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005106#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005107#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005108 while (p < end) {
5109 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5110 for an explanation. */
Ma Lina0c603c2020-10-18 22:48:38 +08005111 if (_Py_IS_ALIGNED(p, SIZEOF_SIZE_T)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005112 /* Help allocation */
5113 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005114 while (_p < aligned_end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005115 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005116 if (value & ASCII_CHAR_MASK)
5117 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005118 _p += SIZEOF_SIZE_T;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005119 }
5120 p = _p;
5121 if (_p == end)
5122 break;
5123 }
5124 if ((unsigned char)*p & 0x80)
5125 break;
5126 ++p;
5127 }
5128 memcpy(dest, start, p - start);
5129 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130}
Antoine Pitrouab868312009-01-10 15:40:25 +00005131
Victor Stinner709d23d2019-05-02 14:56:30 -04005132static PyObject *
5133unicode_decode_utf8(const char *s, Py_ssize_t size,
5134 _Py_error_handler error_handler, const char *errors,
5135 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005136{
Victor Stinner785938e2011-12-11 20:09:03 +01005137 if (size == 0) {
5138 if (consumed)
5139 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005140 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005141 }
5142
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005143 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5144 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005145 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005146 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005147 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005148 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005149 }
5150
Inada Naoki770847a2019-06-24 12:30:24 +09005151 const char *starts = s;
5152 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005153
Inada Naoki770847a2019-06-24 12:30:24 +09005154 // fast path: try ASCII string.
5155 PyObject *u = PyUnicode_New(size, 127);
5156 if (u == NULL) {
5157 return NULL;
5158 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005159 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005160 if (s == end) {
5161 return u;
5162 }
5163
5164 // Use _PyUnicodeWriter after fast path is failed.
5165 _PyUnicodeWriter writer;
5166 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5167 writer.pos = s - starts;
5168
5169 Py_ssize_t startinpos, endinpos;
5170 const char *errmsg = "";
5171 PyObject *error_handler_obj = NULL;
5172 PyObject *exc = NULL;
5173
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005174 while (s < end) {
5175 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005176 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005177
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005178 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005179 if (PyUnicode_IS_ASCII(writer.buffer))
5180 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005181 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005182 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005183 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005184 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005185 } else {
5186 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005187 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005188 }
5189
5190 switch (ch) {
5191 case 0:
5192 if (s == end || consumed)
5193 goto End;
5194 errmsg = "unexpected end of data";
5195 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005196 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005197 break;
5198 case 1:
5199 errmsg = "invalid start byte";
5200 startinpos = s - starts;
5201 endinpos = startinpos + 1;
5202 break;
5203 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005204 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5205 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5206 {
5207 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005208 goto End;
5209 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005210 /* fall through */
5211 case 3:
5212 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005213 errmsg = "invalid continuation byte";
5214 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005215 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005216 break;
5217 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005218 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005219 goto onError;
5220 continue;
5221 }
5222
Victor Stinner1d65d912015-10-05 13:43:50 +02005223 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005224 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005225
5226 switch (error_handler) {
5227 case _Py_ERROR_IGNORE:
5228 s += (endinpos - startinpos);
5229 break;
5230
5231 case _Py_ERROR_REPLACE:
5232 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5233 goto onError;
5234 s += (endinpos - startinpos);
5235 break;
5236
5237 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005238 {
5239 Py_ssize_t i;
5240
Victor Stinner1d65d912015-10-05 13:43:50 +02005241 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5242 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005243 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005244 ch = (Py_UCS4)(unsigned char)(starts[i]);
5245 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5246 ch + 0xdc00);
5247 writer.pos++;
5248 }
5249 s += (endinpos - startinpos);
5250 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005251 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005252
5253 default:
5254 if (unicode_decode_call_errorhandler_writer(
5255 errors, &error_handler_obj,
5256 "utf-8", errmsg,
5257 &starts, &end, &startinpos, &endinpos, &exc, &s,
5258 &writer))
5259 goto onError;
5260 }
Victor Stinner785938e2011-12-11 20:09:03 +01005261 }
5262
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005263End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005264 if (consumed)
5265 *consumed = s - starts;
5266
Victor Stinner1d65d912015-10-05 13:43:50 +02005267 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005268 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005269 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005270
5271onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005272 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005273 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005274 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005275 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005276}
5277
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005278
Victor Stinner709d23d2019-05-02 14:56:30 -04005279PyObject *
5280PyUnicode_DecodeUTF8Stateful(const char *s,
5281 Py_ssize_t size,
5282 const char *errors,
5283 Py_ssize_t *consumed)
5284{
5285 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5286}
5287
5288
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005289/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5290 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005291
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005292 On success, write a pointer to a newly allocated wide character string into
5293 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5294 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005295
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005296 On memory allocation failure, return -1.
5297
5298 On decoding error (if surrogateescape is zero), return -2. If wlen is
5299 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5300 is not NULL, write the decoding error message into *reason. */
5301int
5302_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005303 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005304{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005305 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005306 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005307 wchar_t *unicode;
5308 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005309
Victor Stinner3d4226a2018-08-29 22:21:32 +02005310 int surrogateescape = 0;
5311 int surrogatepass = 0;
5312 switch (errors)
5313 {
5314 case _Py_ERROR_STRICT:
5315 break;
5316 case _Py_ERROR_SURROGATEESCAPE:
5317 surrogateescape = 1;
5318 break;
5319 case _Py_ERROR_SURROGATEPASS:
5320 surrogatepass = 1;
5321 break;
5322 default:
5323 return -3;
5324 }
5325
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005326 /* Note: size will always be longer than the resulting Unicode
5327 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005328 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005329 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005330 }
5331
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005332 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005333 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005334 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005335 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005336
5337 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005338 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005339 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005340 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005341 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005342#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005343 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005344#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005345 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005346#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005347 if (ch > 0xFF) {
5348#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005349 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005350#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005351 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005352 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005353 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5354 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5355#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005356 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005357 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005358 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005359 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005360 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005361
5362 if (surrogateescape) {
5363 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5364 }
5365 else {
5366 /* Is it a valid three-byte code? */
5367 if (surrogatepass
5368 && (e - s) >= 3
5369 && (s[0] & 0xf0) == 0xe0
5370 && (s[1] & 0xc0) == 0x80
5371 && (s[2] & 0xc0) == 0x80)
5372 {
5373 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5374 s += 3;
5375 unicode[outpos++] = ch;
5376 }
5377 else {
5378 PyMem_RawFree(unicode );
5379 if (reason != NULL) {
5380 switch (ch) {
5381 case 0:
5382 *reason = "unexpected end of data";
5383 break;
5384 case 1:
5385 *reason = "invalid start byte";
5386 break;
5387 /* 2, 3, 4 */
5388 default:
5389 *reason = "invalid continuation byte";
5390 break;
5391 }
5392 }
5393 if (wlen != NULL) {
5394 *wlen = s - orig_s;
5395 }
5396 return -2;
5397 }
5398 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005399 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005400 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005401 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005402 if (wlen) {
5403 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005404 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005405 *wstr = unicode;
5406 return 0;
5407}
5408
Victor Stinner5f9cf232019-03-19 01:46:25 +01005409
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005410wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005411_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5412 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005413{
5414 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005415 int res = _Py_DecodeUTF8Ex(arg, arglen,
5416 &wstr, wlen,
5417 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005418 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005419 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5420 assert(res != -3);
5421 if (wlen) {
5422 *wlen = (size_t)res;
5423 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005424 return NULL;
5425 }
5426 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005427}
5428
Antoine Pitrouab868312009-01-10 15:40:25 +00005429
/* UTF-8 encoder. 'errors' selects the error handler and must be one of
   _Py_ERROR_STRICT, _Py_ERROR_SURROGATEESCAPE or _Py_ERROR_SURROGATEPASS;
   for any other handler, return -3.

   On success, return 0 and write the newly allocated character string into
   *str (use PyMem_RawFree() to free the memory if raw_malloc is non-zero,
   PyMem_Free() otherwise).

   On encoding failure, return -2 and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set) and the encoding
   error message into *reason (if reason is set).

   On memory allocation failure, return -1. */
int
5440int
5441_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005442 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005443{
5444 const Py_ssize_t max_char_size = 4;
5445 Py_ssize_t len = wcslen(text);
5446
5447 assert(len >= 0);
5448
Victor Stinner3d4226a2018-08-29 22:21:32 +02005449 int surrogateescape = 0;
5450 int surrogatepass = 0;
5451 switch (errors)
5452 {
5453 case _Py_ERROR_STRICT:
5454 break;
5455 case _Py_ERROR_SURROGATEESCAPE:
5456 surrogateescape = 1;
5457 break;
5458 case _Py_ERROR_SURROGATEPASS:
5459 surrogatepass = 1;
5460 break;
5461 default:
5462 return -3;
5463 }
5464
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005465 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5466 return -1;
5467 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005468 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005469 if (raw_malloc) {
5470 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005471 }
5472 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005473 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005474 }
5475 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005476 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005477 }
5478
5479 char *p = bytes;
5480 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005481 for (i = 0; i < len; ) {
5482 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005483 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005484 i++;
5485#if Py_UNICODE_SIZE == 2
5486 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5487 && i < len
5488 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5489 {
5490 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5491 i++;
5492 }
5493#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005494
5495 if (ch < 0x80) {
5496 /* Encode ASCII */
5497 *p++ = (char) ch;
5498
5499 }
5500 else if (ch < 0x0800) {
5501 /* Encode Latin-1 */
5502 *p++ = (char)(0xc0 | (ch >> 6));
5503 *p++ = (char)(0x80 | (ch & 0x3f));
5504 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005505 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005506 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005507 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005508 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005509 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005510 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005511 if (reason != NULL) {
5512 *reason = "encoding error";
5513 }
5514 if (raw_malloc) {
5515 PyMem_RawFree(bytes);
5516 }
5517 else {
5518 PyMem_Free(bytes);
5519 }
5520 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005521 }
5522 *p++ = (char)(ch & 0xff);
5523 }
5524 else if (ch < 0x10000) {
5525 *p++ = (char)(0xe0 | (ch >> 12));
5526 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5527 *p++ = (char)(0x80 | (ch & 0x3f));
5528 }
5529 else { /* ch >= 0x10000 */
5530 assert(ch <= MAX_UNICODE);
5531 /* Encode UCS4 Unicode ordinals */
5532 *p++ = (char)(0xf0 | (ch >> 18));
5533 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5534 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5535 *p++ = (char)(0x80 | (ch & 0x3f));
5536 }
5537 }
5538 *p++ = '\0';
5539
5540 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005541 char *bytes2;
5542 if (raw_malloc) {
5543 bytes2 = PyMem_RawRealloc(bytes, final_size);
5544 }
5545 else {
5546 bytes2 = PyMem_Realloc(bytes, final_size);
5547 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005548 if (bytes2 == NULL) {
5549 if (error_pos != NULL) {
5550 *error_pos = (size_t)-1;
5551 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005552 if (raw_malloc) {
5553 PyMem_RawFree(bytes);
5554 }
5555 else {
5556 PyMem_Free(bytes);
5557 }
5558 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005559 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005560 *str = bytes2;
5561 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005562}
5563
5564
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005565/* Primary internal function which creates utf8 encoded bytes objects.
5566
5567 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005568 and allocate exactly as much space needed at the end. Else allocate the
5569 maximum possible needed (4 result bytes per Unicode character), and return
5570 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005571*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005572static PyObject *
5573unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5574 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005576 if (!PyUnicode_Check(unicode)) {
5577 PyErr_BadArgument();
5578 return NULL;
5579 }
5580
5581 if (PyUnicode_READY(unicode) == -1)
5582 return NULL;
5583
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005584 if (PyUnicode_UTF8(unicode))
5585 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5586 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005587
Inada Naoki02a4d572020-02-27 13:48:59 +09005588 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005589 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005590 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5591
5592 _PyBytesWriter writer;
5593 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005594
Benjamin Petersonead6b532011-12-20 17:23:42 -06005595 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005596 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005597 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005598 case PyUnicode_1BYTE_KIND:
5599 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5600 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005601 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5602 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005603 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005604 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5605 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005606 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005607 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5608 break;
Tim Peters602f7402002-04-27 18:03:26 +00005609 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005610
5611 if (end == NULL) {
5612 _PyBytesWriter_Dealloc(&writer);
5613 return NULL;
5614 }
5615 return _PyBytesWriter_Finish(&writer, end);
5616}
5617
5618static int
5619unicode_fill_utf8(PyObject *unicode)
5620{
5621 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5622 assert(!PyUnicode_IS_ASCII(unicode));
5623
5624 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005625 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005626 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5627
5628 _PyBytesWriter writer;
5629 char *end;
5630
5631 switch (kind) {
5632 default:
5633 Py_UNREACHABLE();
5634 case PyUnicode_1BYTE_KIND:
5635 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5636 _Py_ERROR_STRICT, NULL);
5637 break;
5638 case PyUnicode_2BYTE_KIND:
5639 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5640 _Py_ERROR_STRICT, NULL);
5641 break;
5642 case PyUnicode_4BYTE_KIND:
5643 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5644 _Py_ERROR_STRICT, NULL);
5645 break;
5646 }
5647 if (end == NULL) {
5648 _PyBytesWriter_Dealloc(&writer);
5649 return -1;
5650 }
5651
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005652 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005653 PyBytes_AS_STRING(writer.buffer);
5654 Py_ssize_t len = end - start;
5655
Victor Stinner32bd68c2020-12-01 10:37:39 +01005656 char *cache = PyObject_Malloc(len + 1);
Inada Naoki02a4d572020-02-27 13:48:59 +09005657 if (cache == NULL) {
5658 _PyBytesWriter_Dealloc(&writer);
5659 PyErr_NoMemory();
5660 return -1;
5661 }
5662 _PyUnicode_UTF8(unicode) = cache;
5663 _PyUnicode_UTF8_LENGTH(unicode) = len;
5664 memcpy(cache, start, len);
5665 cache[len] = '\0';
5666 _PyBytesWriter_Dealloc(&writer);
5667 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005668}
5669
Alexander Belopolsky40018472011-02-26 01:02:56 +00005670PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005671_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5672{
5673 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5674}
5675
5676
5677PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005678PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5679 Py_ssize_t size,
5680 const char *errors)
5681{
5682 PyObject *v, *unicode;
5683
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005684 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005685 if (unicode == NULL)
5686 return NULL;
5687 v = _PyUnicode_AsUTF8String(unicode, errors);
5688 Py_DECREF(unicode);
5689 return v;
5690}
5691
5692PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005693PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005695 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696}
5697
Walter Dörwald41980ca2007-08-16 21:55:45 +00005698/* --- UTF-32 Codec ------------------------------------------------------- */
5699
5700PyObject *
5701PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 Py_ssize_t size,
5703 const char *errors,
5704 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005705{
5706 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5707}
5708
/* Decode `size` bytes of UTF-32 data starting at `s` into a str object.

   errors:    error handler name, forwarded to
              unicode_decode_call_errorhandler_writer().
   byteorder: optional in/out byte order.  A value < 0 selects little
              endian, > 0 big endian.  With 0 (or a NULL pointer) a leading
              BOM is looked for: if found it is skipped and decides the
              order, otherwise the native order is used.  When the pointer
              is non-NULL, the initial order is 0 and size >= 4, the
              detected order (possibly still 0) is stored back.
   consumed:  if non-NULL, fewer than 4 trailing bytes are not an error:
              decoding stops in front of them and *consumed receives the
              number of bytes decoded.  If NULL, such a tail is reported as
              "truncated data".

   Return the decoded string, or NULL on failure. */
5709PyObject *
5710PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 Py_ssize_t size,
5712 const char *errors,
5713 int *byteorder,
5714 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005715{
5716 const char *starts = s;
5717 Py_ssize_t startinpos;
5718 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005719 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005720 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005721 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005722 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005723 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005724 PyObject *errorHandler = NULL;
5725 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005726
/* q is the read cursor, e is one past the last input byte */
Andy Lestere6be9b52020-02-11 20:28:35 -06005727 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005728 e = q + size;
5729
5730 if (byteorder)
5731 bo = *byteorder;
5732
5733 /* Check for BOM marks (U+FEFF) in the input and adjust current
5734 byte order setting accordingly. In native mode, the leading BOM
5735 mark is skipped, in all other modes, it is copied to the output
5736 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005737 if (bo == 0 && size >= 4) {
/* the first 4 bytes are read as a little-endian value: 0x0000FEFF then
   means a little-endian BOM, 0xFFFE0000 a big-endian one */
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005738 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005739 if (bom == 0x0000FEFF) {
5740 bo = -1;
5741 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005743 else if (bom == 0xFFFE0000) {
5744 bo = 1;
5745 q += 4;
5746 }
5747 if (byteorder)
5748 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005749 }
5750
/* nothing left after the optional BOM: the result is the empty string */
Victor Stinnere64322e2012-10-30 23:12:47 +01005751 if (q == e) {
5752 if (consumed)
5753 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005754 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005755 }
5756
/* le != 0 means the input is read as little endian; bo == 0 falls back
   to the byte order of the build */
Victor Stinnere64322e2012-10-30 23:12:47 +01005757#ifdef WORDS_BIGENDIAN
5758 le = bo < 0;
5759#else
5760 le = bo <= 0;
5761#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005762 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005763
Victor Stinner8f674cc2013-04-17 23:02:17 +02005764 _PyUnicodeWriter_Init(&writer);
/* ceil((e - q) / 4): one output character per 4 input bytes, rounded up */
Victor Stinner170ca6f2013-04-18 00:25:28 +02005765 writer.min_length = (e - q + 3) / 4;
5766 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005767 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005768
Victor Stinnere64322e2012-10-30 23:12:47 +01005769 while (1) {
5770 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005771 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005772
/* Fast path: while at least 4 bytes remain, store code points directly
   into the writer's buffer as long as they fit its current kind
   (ch <= maxch) and are not surrogates.  On exit `ch` is either the last
   character stored or the first one that could not be stored; in the
   latter case q still points at it. */
Victor Stinnere64322e2012-10-30 23:12:47 +01005773 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005774 enum PyUnicode_Kind kind = writer.kind;
5775 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005776 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005777 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005778 if (le) {
5779 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005780 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005781 if (ch > maxch)
5782 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005783 if (kind != PyUnicode_1BYTE_KIND &&
5784 Py_UNICODE_IS_SURROGATE(ch))
5785 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005786 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005787 q += 4;
5788 } while (q <= last);
5789 }
5790 else {
5791 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005792 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005793 if (ch > maxch)
5794 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005795 if (kind != PyUnicode_1BYTE_KIND &&
5796 Py_UNICODE_IS_SURROGATE(ch))
5797 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005798 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005799 q += 4;
5800 } while (q <= last);
5801 }
/* publish the number of characters stored by the fast path */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005802 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005803 }
5804
/* Decide why the fast path stopped (or why it never ran). */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005805 if (Py_UNICODE_IS_SURROGATE(ch)) {
/* a surrogate code point is an error covering its 4 bytes */
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005806 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005807 startinpos = ((const char *)q) - starts;
5808 endinpos = startinpos + 4;
5809 }
5810 else if (ch <= maxch) {
/* the fast path did not break out early, so fewer than 4 bytes are left:
   finished if there are none, or if the caller accepts a partial tail */
Victor Stinnere64322e2012-10-30 23:12:47 +01005811 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005812 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005813 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005814 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005815 startinpos = ((const char *)q) - starts;
5816 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005817 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005818 else {
/* ch does not fit the buffer's current kind: if it is a valid code point,
   write it through the writer API (presumably widening the buffer --
   confirm in _PyUnicodeWriter_WriteCharInline) and return to the fast
   path */
5819 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005820 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005821 goto onError;
5822 q += 4;
5823 continue;
5824 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005825 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005826 startinpos = ((const char *)q) - starts;
5827 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005829
5830 /* The remaining input chars are ignored if the callback
5831 chooses to skip the input */
/* starts, e and q are passed by address, so the handler call may update
   the input pointers */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005832 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005833 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005834 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005835 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005836 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005837 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005838 }
5839
Walter Dörwald41980ca2007-08-16 21:55:45 +00005840 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005841 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005842
Walter Dörwald41980ca2007-08-16 21:55:45 +00005843 Py_XDECREF(errorHandler);
5844 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005845 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005846
/* failure exit: drop the partial result and the error-handling state */
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005848 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005849 Py_XDECREF(errorHandler);
5850 Py_XDECREF(exc);
5851 return NULL;
5852}
5853
5854PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005855_PyUnicode_EncodeUTF32(PyObject *str,
5856 const char *errors,
5857 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005858{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005859 enum PyUnicode_Kind kind;
5860 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005861 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005862 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005863 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005864#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005865 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005866#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005867 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005868#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005869 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005870 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005871 PyObject *errorHandler = NULL;
5872 PyObject *exc = NULL;
5873 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005874
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005875 if (!PyUnicode_Check(str)) {
5876 PyErr_BadArgument();
5877 return NULL;
5878 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005879 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005880 return NULL;
5881 kind = PyUnicode_KIND(str);
5882 data = PyUnicode_DATA(str);
5883 len = PyUnicode_GET_LENGTH(str);
5884
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005885 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005886 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005887 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005888 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005889 if (v == NULL)
5890 return NULL;
5891
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005892 /* output buffer is 4-bytes aligned */
5893 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005894 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005895 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005896 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005897 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005898 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005899
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005900 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005901 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005902 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005903 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005904 else
5905 encoding = "utf-32";
5906
5907 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005908 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5909 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005910 }
5911
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005912 pos = 0;
5913 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005914 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005915
5916 if (kind == PyUnicode_2BYTE_KIND) {
5917 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5918 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005919 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005920 else {
5921 assert(kind == PyUnicode_4BYTE_KIND);
5922 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5923 &out, native_ordering);
5924 }
5925 if (pos == len)
5926 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005927
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005928 rep = unicode_encode_call_errorhandler(
5929 errors, &errorHandler,
5930 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005931 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005932 if (!rep)
5933 goto error;
5934
5935 if (PyBytes_Check(rep)) {
5936 repsize = PyBytes_GET_SIZE(rep);
5937 if (repsize & 3) {
5938 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005939 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005940 "surrogates not allowed");
5941 goto error;
5942 }
5943 moreunits = repsize / 4;
5944 }
5945 else {
5946 assert(PyUnicode_Check(rep));
5947 if (PyUnicode_READY(rep) < 0)
5948 goto error;
5949 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5950 if (!PyUnicode_IS_ASCII(rep)) {
5951 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005952 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005953 "surrogates not allowed");
5954 goto error;
5955 }
5956 }
5957
5958 /* four bytes are reserved for each surrogate */
5959 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005960 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005961 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005962 /* integer overflow */
5963 PyErr_NoMemory();
5964 goto error;
5965 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005966 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005967 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005968 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005969 }
5970
5971 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005972 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005973 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005974 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005975 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005976 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5977 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005978 }
5979
5980 Py_CLEAR(rep);
5981 }
5982
5983 /* Cut back to size actually needed. This is necessary for, for example,
5984 encoding of a string containing isolated surrogates and the 'ignore'
5985 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005986 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005987 if (nsize != PyBytes_GET_SIZE(v))
5988 _PyBytes_Resize(&v, nsize);
5989 Py_XDECREF(errorHandler);
5990 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005991 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005992 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005993 error:
5994 Py_XDECREF(rep);
5995 Py_XDECREF(errorHandler);
5996 Py_XDECREF(exc);
5997 Py_XDECREF(v);
5998 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005999}
6000
Alexander Belopolsky40018472011-02-26 01:02:56 +00006001PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006002PyUnicode_EncodeUTF32(const Py_UNICODE *s,
6003 Py_ssize_t size,
6004 const char *errors,
6005 int byteorder)
6006{
6007 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006008 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006009 if (tmp == NULL)
6010 return NULL;
6011 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
6012 Py_DECREF(tmp);
6013 return result;
6014}
6015
6016PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006017PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00006018{
Victor Stinnerb960b342011-11-20 19:12:52 +01006019 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00006020}
6021
Guido van Rossumd57fd912000-03-10 22:53:23 +00006022/* --- UTF-16 Codec ------------------------------------------------------- */
6023
Tim Peters772747b2001-08-09 22:21:55 +00006024PyObject *
6025PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006026 Py_ssize_t size,
6027 const char *errors,
6028 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029{
Walter Dörwald69652032004-09-07 20:24:22 +00006030 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6031}
6032
/* Decode `size` bytes of UTF-16 data starting at `s` into a str object.

   errors:    error handler name, forwarded to
              unicode_decode_call_errorhandler_writer().
   byteorder: optional in/out byte order.  A value < 0 selects little
              endian, > 0 big endian.  With 0 (or a NULL pointer) a leading
              BOM is looked for: if found it is skipped and decides the
              order, otherwise the native order is used.  When the pointer
              is non-NULL, the initial order is 0 and size >= 2, the
              detected order (possibly still 0) is stored back.
   consumed:  if non-NULL, an incomplete sequence at the end of the input
              is not an error: decoding stops in front of it and *consumed
              receives the number of bytes decoded.

   Return the decoded string, or NULL on failure. */
6033PyObject *
6034PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006035 Py_ssize_t size,
6036 const char *errors,
6037 int *byteorder,
6038 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00006039{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006040 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006041 Py_ssize_t startinpos;
6042 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006043 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006044 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00006045 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006046 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00006047 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006048 PyObject *errorHandler = NULL;
6049 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006050 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051
/* q is the read cursor, e is one past the last input byte */
Andy Lestere6be9b52020-02-11 20:28:35 -06006052 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006053 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054
6055 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00006056 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006058 /* Check for BOM marks (U+FEFF) in the input and adjust current
6059 byte order setting accordingly. In native mode, the leading BOM
6060 mark is skipped, in all other modes, it is copied to the output
6061 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006062 if (bo == 0 && size >= 2) {
/* the first 2 bytes are read as a little-endian value: 0xFEFF then means
   a little-endian BOM, 0xFFFE a big-endian one */
6063 const Py_UCS4 bom = (q[1] << 8) | q[0];
6064 if (bom == 0xFEFF) {
6065 q += 2;
6066 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006067 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006068 else if (bom == 0xFFFE) {
6069 q += 2;
6070 bo = 1;
6071 }
6072 if (byteorder)
6073 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006074 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006075
/* nothing left after the optional BOM: the result is the empty string */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006076 if (q == e) {
6077 if (consumed)
6078 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006079 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00006080 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006081
/* native_ordering != 0 means the input byte order equals the byte order
   of the build; bo == 0 counts as native */
Christian Heimes743e0cd2012-10-17 23:52:17 +02006082#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02006083 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006084 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00006085#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02006086 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006087 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00006088#endif
Tim Peters772747b2001-08-09 22:21:55 +00006089
Antoine Pitrou63065d72012-05-15 23:48:04 +02006090 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08006091 character count normally. Error handler will take care of
6092 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006093 _PyUnicodeWriter_Init(&writer);
/* ceil((e - q) / 2): one output character per 2 input bytes, rounded up */
Victor Stinner170ca6f2013-04-18 00:25:28 +02006094 writer.min_length = (e - q + 1) / 2;
6095 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006096 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006097
Antoine Pitrou63065d72012-05-15 23:48:04 +02006098 while (1) {
6099 Py_UCS4 ch = 0;
/* With at least 2 bytes left, run the decoder matching the writer's
   current buffer kind.  It receives q and writer.pos by address.  Its
   return value is dispatched below: 0..3 are status codes, anything else
   is a character to be written through the writer. */
6100 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006101 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006102 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006103 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02006104 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006105 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006106 native_ordering);
6107 else
6108 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006109 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006110 native_ordering);
6111 } else if (kind == PyUnicode_2BYTE_KIND) {
6112 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006113 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006114 native_ordering);
6115 } else {
6116 assert(kind == PyUnicode_4BYTE_KIND);
6117 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006118 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006119 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00006120 }
Antoine Pitrouab868312009-01-10 15:40:25 +00006121 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006122
Antoine Pitrou63065d72012-05-15 23:48:04 +02006123 switch (ch)
6124 {
/* 0: the decoder consumed all it could (or was not called because fewer
   than 2 bytes remain) */
6125 case 0:
6126 /* remaining byte at the end? (size should be even) */
6127 if (q == e || consumed)
6128 goto End;
6129 errmsg = "truncated data";
6130 startinpos = ((const char *)q) - starts;
6131 endinpos = ((const char *)e) - starts;
6132 break;
6133 /* The remaining input chars are ignored if the callback
6134 chooses to skip the input */
/* 1: the input ended in the middle of a sequence.  q is moved back by the
   2 bytes already read so that they are either left unconsumed (stateful
   mode) or included in the reported error range. */
6135 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006136 q -= 2;
6137 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006138 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006139 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006140 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006141 endinpos = ((const char *)e) - starts;
6142 break;
/* 2: the error range is the 2 bytes just before q */
6143 case 2:
6144 errmsg = "illegal encoding";
6145 startinpos = ((const char *)q) - 2 - starts;
6146 endinpos = startinpos + 2;
6147 break;
/* 3: the error range is the 2 bytes that start 4 bytes before q */
6148 case 3:
6149 errmsg = "illegal UTF-16 surrogate";
6150 startinpos = ((const char *)q) - 4 - starts;
6151 endinpos = startinpos + 2;
6152 break;
/* any other value is a decoded character: write it through the writer API
   and run the decoder again */
6153 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006154 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006155 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006156 continue;
6157 }
6158
/* starts, e and q are passed by address, so the handler call may update
   the input pointers */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006159 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006160 errors,
6161 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006162 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006163 &starts,
6164 (const char **)&e,
6165 &startinpos,
6166 &endinpos,
6167 &exc,
6168 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006169 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 }
6172
/* normal exit: report the consumed byte count and build the result */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006173End:
Walter Dörwald69652032004-09-07 20:24:22 +00006174 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006175 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006176
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006177 Py_XDECREF(errorHandler);
6178 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006179 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180
/* failure exit: drop the partial result and the error-handling state */
Benjamin Peterson29060642009-01-31 22:14:21 +00006181 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006182 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006183 Py_XDECREF(errorHandler);
6184 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185 return NULL;
6186}
6187
Tim Peters772747b2001-08-09 22:21:55 +00006188PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006189_PyUnicode_EncodeUTF16(PyObject *str,
6190 const char *errors,
6191 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006193 enum PyUnicode_Kind kind;
6194 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006195 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006196 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006197 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006198 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006199#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006200 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006201#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006202 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006203#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006204 const char *encoding;
6205 Py_ssize_t nsize, pos;
6206 PyObject *errorHandler = NULL;
6207 PyObject *exc = NULL;
6208 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006209
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006210 if (!PyUnicode_Check(str)) {
6211 PyErr_BadArgument();
6212 return NULL;
6213 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006214 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006215 return NULL;
6216 kind = PyUnicode_KIND(str);
6217 data = PyUnicode_DATA(str);
6218 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006219
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006220 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006221 if (kind == PyUnicode_4BYTE_KIND) {
6222 const Py_UCS4 *in = (const Py_UCS4 *)data;
6223 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006224 while (in < end) {
6225 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006226 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006227 }
6228 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006229 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006230 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006231 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006232 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006233 nsize = len + pairs + (byteorder == 0);
6234 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006235 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006239 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006240 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006241 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006242 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006243 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006244 }
6245 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006246 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006247 }
Tim Peters772747b2001-08-09 22:21:55 +00006248
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006249 if (kind == PyUnicode_1BYTE_KIND) {
6250 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6251 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006252 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006253
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006254 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006255 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006256 }
6257 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006258 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006259 }
6260 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006261 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006262 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006263
6264 pos = 0;
6265 while (pos < len) {
6266 Py_ssize_t repsize, moreunits;
6267
6268 if (kind == PyUnicode_2BYTE_KIND) {
6269 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6270 &out, native_ordering);
6271 }
6272 else {
6273 assert(kind == PyUnicode_4BYTE_KIND);
6274 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6275 &out, native_ordering);
6276 }
6277 if (pos == len)
6278 break;
6279
6280 rep = unicode_encode_call_errorhandler(
6281 errors, &errorHandler,
6282 encoding, "surrogates not allowed",
6283 str, &exc, pos, pos + 1, &pos);
6284 if (!rep)
6285 goto error;
6286
6287 if (PyBytes_Check(rep)) {
6288 repsize = PyBytes_GET_SIZE(rep);
6289 if (repsize & 1) {
6290 raise_encode_exception(&exc, encoding,
6291 str, pos - 1, pos,
6292 "surrogates not allowed");
6293 goto error;
6294 }
6295 moreunits = repsize / 2;
6296 }
6297 else {
6298 assert(PyUnicode_Check(rep));
6299 if (PyUnicode_READY(rep) < 0)
6300 goto error;
6301 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6302 if (!PyUnicode_IS_ASCII(rep)) {
6303 raise_encode_exception(&exc, encoding,
6304 str, pos - 1, pos,
6305 "surrogates not allowed");
6306 goto error;
6307 }
6308 }
6309
6310 /* two bytes are reserved for each surrogate */
6311 if (moreunits > 1) {
6312 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006313 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006314 /* integer overflow */
6315 PyErr_NoMemory();
6316 goto error;
6317 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006318 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006319 goto error;
6320 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6321 }
6322
6323 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006324 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006325 out += moreunits;
6326 } else /* rep is unicode */ {
6327 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6328 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6329 &out, native_ordering);
6330 }
6331
6332 Py_CLEAR(rep);
6333 }
6334
6335 /* Cut back to size actually needed. This is necessary for, for example,
6336 encoding of a string containing isolated surrogates and the 'ignore' handler
6337 is used. */
6338 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6339 if (nsize != PyBytes_GET_SIZE(v))
6340 _PyBytes_Resize(&v, nsize);
6341 Py_XDECREF(errorHandler);
6342 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006343 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006344 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006345 error:
6346 Py_XDECREF(rep);
6347 Py_XDECREF(errorHandler);
6348 Py_XDECREF(exc);
6349 Py_XDECREF(v);
6350 return NULL;
6351#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352}
6353
Alexander Belopolsky40018472011-02-26 01:02:56 +00006354PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006355PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6356 Py_ssize_t size,
6357 const char *errors,
6358 int byteorder)
6359{
6360 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006361 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006362 if (tmp == NULL)
6363 return NULL;
6364 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6365 Py_DECREF(tmp);
6366 return result;
6367}
6368
6369PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006370PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006371{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006372 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373}
6374
6375/* --- Unicode Escape Codec ----------------------------------------------- */
6376
/* Cached pointer to the unicodedata name-lookup C API used for \N{...}
   escapes.  It stays NULL until _PyUnicode_DecodeUnicodeEscape() first
   needs it and loads it with PyCapsule_Import(). */
Victor Stinner47e1afd2020-10-26 16:43:47 +01006377static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006378
Alexander Belopolsky40018472011-02-26 01:02:56 +00006379PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006380_PyUnicode_DecodeUnicodeEscape(const char *s,
6381 Py_ssize_t size,
6382 const char *errors,
6383 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006384{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006385 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006386 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006388 PyObject *errorHandler = NULL;
6389 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006390
Eric V. Smith42454af2016-10-31 09:22:08 -04006391 // so we can remember if we've seen an invalid escape char or not
6392 *first_invalid_escape = NULL;
6393
Victor Stinner62ec3312016-09-06 17:04:34 -07006394 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006395 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006396 }
6397 /* Escaped strings will always be longer than the resulting
6398 Unicode string, so we start with size here and then reduce the
6399 length after conversion to the true value.
6400 (but if the error callback returns a long replacement string
6401 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006402 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006403 writer.min_length = size;
6404 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6405 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006406 }
6407
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 end = s + size;
6409 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 unsigned char c = (unsigned char) *s++;
6411 Py_UCS4 ch;
6412 int count;
6413 Py_ssize_t startinpos;
6414 Py_ssize_t endinpos;
6415 const char *message;
6416
6417#define WRITE_ASCII_CHAR(ch) \
6418 do { \
6419 assert(ch <= 127); \
6420 assert(writer.pos < writer.size); \
6421 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6422 } while(0)
6423
6424#define WRITE_CHAR(ch) \
6425 do { \
6426 if (ch <= writer.maxchar) { \
6427 assert(writer.pos < writer.size); \
6428 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6429 } \
6430 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6431 goto onError; \
6432 } \
6433 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434
6435 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006436 if (c != '\\') {
6437 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438 continue;
6439 }
6440
Victor Stinner62ec3312016-09-06 17:04:34 -07006441 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006443 if (s >= end) {
6444 message = "\\ at end of string";
6445 goto error;
6446 }
6447 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006448
Victor Stinner62ec3312016-09-06 17:04:34 -07006449 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006450 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451
Benjamin Peterson29060642009-01-31 22:14:21 +00006452 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006453 case '\n': continue;
6454 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6455 case '\'': WRITE_ASCII_CHAR('\''); continue;
6456 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6457 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006458 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006459 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6460 case 't': WRITE_ASCII_CHAR('\t'); continue;
6461 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6462 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006463 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006464 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006465 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006466 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467
Benjamin Peterson29060642009-01-31 22:14:21 +00006468 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 case '0': case '1': case '2': case '3':
6470 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006471 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006472 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006473 ch = (ch<<3) + *s++ - '0';
6474 if (s < end && '0' <= *s && *s <= '7') {
6475 ch = (ch<<3) + *s++ - '0';
6476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006477 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006478 WRITE_CHAR(ch);
6479 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480
Benjamin Peterson29060642009-01-31 22:14:21 +00006481 /* hex escapes */
6482 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006483 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006484 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006485 message = "truncated \\xXX escape";
6486 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006487
Benjamin Peterson29060642009-01-31 22:14:21 +00006488 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006489 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006490 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006491 message = "truncated \\uXXXX escape";
6492 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493
Benjamin Peterson29060642009-01-31 22:14:21 +00006494 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006495 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006496 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006497 message = "truncated \\UXXXXXXXX escape";
6498 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006500 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006501 ch <<= 4;
6502 if (c >= '0' && c <= '9') {
6503 ch += c - '0';
6504 }
6505 else if (c >= 'a' && c <= 'f') {
6506 ch += c - ('a' - 10);
6507 }
6508 else if (c >= 'A' && c <= 'F') {
6509 ch += c - ('A' - 10);
6510 }
6511 else {
6512 break;
6513 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006514 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006515 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006516 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006517 }
6518
6519 /* when we get here, ch is a 32-bit unicode character */
6520 if (ch > MAX_UNICODE) {
6521 message = "illegal Unicode character";
6522 goto error;
6523 }
6524
6525 WRITE_CHAR(ch);
6526 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006527
Benjamin Peterson29060642009-01-31 22:14:21 +00006528 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006529 case 'N':
Victor Stinner47e1afd2020-10-26 16:43:47 +01006530 if (ucnhash_capi == NULL) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006531 /* load the unicode data module */
Victor Stinner47e1afd2020-10-26 16:43:47 +01006532 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006533 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner47e1afd2020-10-26 16:43:47 +01006534 if (ucnhash_capi == NULL) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006535 PyErr_SetString(
6536 PyExc_UnicodeError,
6537 "\\N escapes not supported (can't load unicodedata module)"
6538 );
6539 goto onError;
6540 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006541 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006542
6543 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006544 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006545 const char *start = ++s;
6546 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006547 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006548 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006549 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006550 namelen = s - start;
6551 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006552 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006553 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006554 ch = 0xffffffff; /* in case 'getcode' messes up */
6555 if (namelen <= INT_MAX &&
Victor Stinner920cb642020-10-26 19:19:36 +01006556 ucnhash_capi->getcode(start, (int)namelen,
Victor Stinner62ec3312016-09-06 17:04:34 -07006557 &ch, 0)) {
6558 assert(ch <= MAX_UNICODE);
6559 WRITE_CHAR(ch);
6560 continue;
6561 }
6562 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006563 }
6564 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006565 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006566
6567 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006568 if (*first_invalid_escape == NULL) {
6569 *first_invalid_escape = s-1; /* Back up one char, since we've
6570 already incremented s. */
6571 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006572 WRITE_ASCII_CHAR('\\');
6573 WRITE_CHAR(c);
6574 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006575 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006576
6577 error:
6578 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006579 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006580 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006581 errors, &errorHandler,
6582 "unicodeescape", message,
6583 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006584 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006585 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006586 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006587 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006588
6589#undef WRITE_ASCII_CHAR
6590#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006591 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006592
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006593 Py_XDECREF(errorHandler);
6594 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006595 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006596
Benjamin Peterson29060642009-01-31 22:14:21 +00006597 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006598 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006599 Py_XDECREF(errorHandler);
6600 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 return NULL;
6602}
6603
Eric V. Smith42454af2016-10-31 09:22:08 -04006604PyObject *
6605PyUnicode_DecodeUnicodeEscape(const char *s,
6606 Py_ssize_t size,
6607 const char *errors)
6608{
6609 const char *first_invalid_escape;
6610 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6611 &first_invalid_escape);
6612 if (result == NULL)
6613 return NULL;
6614 if (first_invalid_escape != NULL) {
6615 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6616 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006617 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006618 Py_DECREF(result);
6619 return NULL;
6620 }
6621 }
6622 return result;
6623}
6624
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006625/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006626
Alexander Belopolsky40018472011-02-26 01:02:56 +00006627PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006628PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006629{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006630 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006631 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006632 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006633 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006634 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006635 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006636
Ezio Melottie7f90372012-10-05 03:33:31 +03006637 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006638 escape.
6639
Ezio Melottie7f90372012-10-05 03:33:31 +03006640 For UCS1 strings it's '\xxx', 4 bytes per source character.
6641 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6642 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006643 */
6644
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006645 if (!PyUnicode_Check(unicode)) {
6646 PyErr_BadArgument();
6647 return NULL;
6648 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006649 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006650 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006651 }
Victor Stinner358af132015-10-12 22:36:57 +02006652
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006653 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006654 if (len == 0) {
6655 return PyBytes_FromStringAndSize(NULL, 0);
6656 }
6657
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006658 kind = PyUnicode_KIND(unicode);
6659 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006660 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6661 bytes, and 1 byte characters 4. */
6662 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006663 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006664 return PyErr_NoMemory();
6665 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006666 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006667 if (repr == NULL) {
6668 return NULL;
6669 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006670
Victor Stinner62ec3312016-09-06 17:04:34 -07006671 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006672 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006673 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006674
Victor Stinner62ec3312016-09-06 17:04:34 -07006675 /* U+0000-U+00ff range */
6676 if (ch < 0x100) {
6677 if (ch >= ' ' && ch < 127) {
6678 if (ch != '\\') {
6679 /* Copy printable US ASCII as-is */
6680 *p++ = (char) ch;
6681 }
6682 /* Escape backslashes */
6683 else {
6684 *p++ = '\\';
6685 *p++ = '\\';
6686 }
6687 }
Victor Stinner358af132015-10-12 22:36:57 +02006688
Victor Stinner62ec3312016-09-06 17:04:34 -07006689 /* Map special whitespace to '\t', \n', '\r' */
6690 else if (ch == '\t') {
6691 *p++ = '\\';
6692 *p++ = 't';
6693 }
6694 else if (ch == '\n') {
6695 *p++ = '\\';
6696 *p++ = 'n';
6697 }
6698 else if (ch == '\r') {
6699 *p++ = '\\';
6700 *p++ = 'r';
6701 }
6702
6703 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6704 else {
6705 *p++ = '\\';
6706 *p++ = 'x';
6707 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6708 *p++ = Py_hexdigits[ch & 0x000F];
6709 }
Tim Petersced69f82003-09-16 20:30:58 +00006710 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006711 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006712 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006713 *p++ = '\\';
6714 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006715 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6716 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6717 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6718 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006720 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6721 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006722
Victor Stinner62ec3312016-09-06 17:04:34 -07006723 /* Make sure that the first two digits are zero */
6724 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006725 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006726 *p++ = 'U';
6727 *p++ = '0';
6728 *p++ = '0';
6729 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6730 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6731 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6732 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6733 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6734 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737
Victor Stinner62ec3312016-09-06 17:04:34 -07006738 assert(p - PyBytes_AS_STRING(repr) > 0);
6739 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6740 return NULL;
6741 }
6742 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743}
6744
Alexander Belopolsky40018472011-02-26 01:02:56 +00006745PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006746PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6747 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006748{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006749 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006750 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006751 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006753 }
6754
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006755 result = PyUnicode_AsUnicodeEscapeString(tmp);
6756 Py_DECREF(tmp);
6757 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758}
6759
6760/* --- Raw Unicode Escape Codec ------------------------------------------- */
6761
Alexander Belopolsky40018472011-02-26 01:02:56 +00006762PyObject *
6763PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006764 Py_ssize_t size,
6765 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006767 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006768 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006769 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006770 PyObject *errorHandler = NULL;
6771 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006772
Victor Stinner62ec3312016-09-06 17:04:34 -07006773 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006774 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006775 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006776
Guido van Rossumd57fd912000-03-10 22:53:23 +00006777 /* Escaped strings will always be longer than the resulting
6778 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006779 length after conversion to the true value. (But decoding error
6780 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006781 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006782 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006783 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6784 goto onError;
6785 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006786
Guido van Rossumd57fd912000-03-10 22:53:23 +00006787 end = s + size;
6788 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006789 unsigned char c = (unsigned char) *s++;
6790 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006791 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006792 Py_ssize_t startinpos;
6793 Py_ssize_t endinpos;
6794 const char *message;
6795
6796#define WRITE_CHAR(ch) \
6797 do { \
6798 if (ch <= writer.maxchar) { \
6799 assert(writer.pos < writer.size); \
6800 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6801 } \
6802 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6803 goto onError; \
6804 } \
6805 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006808 if (c != '\\' || s >= end) {
6809 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006810 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006811 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006812
Victor Stinner62ec3312016-09-06 17:04:34 -07006813 c = (unsigned char) *s++;
6814 if (c == 'u') {
6815 count = 4;
6816 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006817 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006818 else if (c == 'U') {
6819 count = 8;
6820 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006821 }
6822 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006823 assert(writer.pos < writer.size);
6824 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6825 WRITE_CHAR(c);
6826 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006827 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006828 startinpos = s - starts - 2;
6829
6830 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6831 for (ch = 0; count && s < end; ++s, --count) {
6832 c = (unsigned char)*s;
6833 ch <<= 4;
6834 if (c >= '0' && c <= '9') {
6835 ch += c - '0';
6836 }
6837 else if (c >= 'a' && c <= 'f') {
6838 ch += c - ('a' - 10);
6839 }
6840 else if (c >= 'A' && c <= 'F') {
6841 ch += c - ('A' - 10);
6842 }
6843 else {
6844 break;
6845 }
6846 }
6847 if (!count) {
6848 if (ch <= MAX_UNICODE) {
6849 WRITE_CHAR(ch);
6850 continue;
6851 }
6852 message = "\\Uxxxxxxxx out of range";
6853 }
6854
6855 endinpos = s-starts;
6856 writer.min_length = end - s + writer.pos;
6857 if (unicode_decode_call_errorhandler_writer(
6858 errors, &errorHandler,
6859 "rawunicodeescape", message,
6860 &starts, &end, &startinpos, &endinpos, &exc, &s,
6861 &writer)) {
6862 goto onError;
6863 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006864 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006865
6866#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006867 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006868 Py_XDECREF(errorHandler);
6869 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006870 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006871
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006873 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006874 Py_XDECREF(errorHandler);
6875 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006877
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878}
6879
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006880
Alexander Belopolsky40018472011-02-26 01:02:56 +00006881PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006882PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006883{
Victor Stinner62ec3312016-09-06 17:04:34 -07006884 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006885 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006886 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006887 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006888 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006889 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006890
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006891 if (!PyUnicode_Check(unicode)) {
6892 PyErr_BadArgument();
6893 return NULL;
6894 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006895 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006896 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006897 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006898 kind = PyUnicode_KIND(unicode);
6899 data = PyUnicode_DATA(unicode);
6900 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006901 if (kind == PyUnicode_1BYTE_KIND) {
6902 return PyBytes_FromStringAndSize(data, len);
6903 }
Victor Stinner0e368262011-11-10 20:12:49 +01006904
Victor Stinner62ec3312016-09-06 17:04:34 -07006905 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6906 bytes, and 1 byte characters 4. */
6907 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006908
Victor Stinner62ec3312016-09-06 17:04:34 -07006909 if (len > PY_SSIZE_T_MAX / expandsize) {
6910 return PyErr_NoMemory();
6911 }
6912 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6913 if (repr == NULL) {
6914 return NULL;
6915 }
6916 if (len == 0) {
6917 return repr;
6918 }
6919
6920 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006921 for (pos = 0; pos < len; pos++) {
6922 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006923
Victor Stinner62ec3312016-09-06 17:04:34 -07006924 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6925 if (ch < 0x100) {
6926 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006927 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006928 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006929 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006930 *p++ = '\\';
6931 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006932 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6933 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6934 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6935 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006936 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006937 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6938 else {
6939 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6940 *p++ = '\\';
6941 *p++ = 'U';
6942 *p++ = '0';
6943 *p++ = '0';
6944 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6945 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6946 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6947 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6948 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6949 *p++ = Py_hexdigits[ch & 15];
6950 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006952
Victor Stinner62ec3312016-09-06 17:04:34 -07006953 assert(p > PyBytes_AS_STRING(repr));
6954 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6955 return NULL;
6956 }
6957 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958}
6959
Alexander Belopolsky40018472011-02-26 01:02:56 +00006960PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006961PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6962 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006964 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006965 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006966 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006967 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006968 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6969 Py_DECREF(tmp);
6970 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006971}
6972
6973/* --- Latin-1 Codec ------------------------------------------------------ */
6974
Alexander Belopolsky40018472011-02-26 01:02:56 +00006975PyObject *
6976PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006977 Py_ssize_t size,
6978 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006981 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006982}
6983
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006984/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006985static void
6986make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006987 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006988 PyObject *unicode,
6989 Py_ssize_t startpos, Py_ssize_t endpos,
6990 const char *reason)
6991{
6992 if (*exceptionObject == NULL) {
6993 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006994 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006995 encoding, unicode, startpos, endpos, reason);
6996 }
6997 else {
6998 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6999 goto onError;
7000 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7001 goto onError;
7002 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7003 goto onError;
7004 return;
7005 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02007006 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01007007 }
7008}
7009
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007010/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007011static void
7012raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007013 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007014 PyObject *unicode,
7015 Py_ssize_t startpos, Py_ssize_t endpos,
7016 const char *reason)
7017{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007018 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007019 encoding, unicode, startpos, endpos, reason);
7020 if (*exceptionObject != NULL)
7021 PyCodec_StrictErrors(*exceptionObject);
7022}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007023
7024/* error handling callback helper:
7025 build arguments, call the callback and check the arguments,
7026 put the result into newpos and return the replacement string, which
7027 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007028static PyObject *
7029unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007030 PyObject **errorHandler,
7031 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007032 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007033 Py_ssize_t startpos, Py_ssize_t endpos,
7034 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007035{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02007036 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007037 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007038 PyObject *restuple;
7039 PyObject *resunicode;
7040
7041 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007042 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007043 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007044 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007045 }
7046
Benjamin Petersonbac79492012-01-14 13:34:47 -05007047 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007048 return NULL;
7049 len = PyUnicode_GET_LENGTH(unicode);
7050
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007051 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007052 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007053 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007054 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007055
Petr Viktorinffd97532020-02-11 17:46:57 +01007056 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007057 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007058 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007059 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007060 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007061 Py_DECREF(restuple);
7062 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007063 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007064 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00007065 &resunicode, newpos)) {
7066 Py_DECREF(restuple);
7067 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007068 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007069 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7070 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7071 Py_DECREF(restuple);
7072 return NULL;
7073 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007074 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007075 *newpos = len + *newpos;
7076 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02007077 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007078 Py_DECREF(restuple);
7079 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007080 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007081 Py_INCREF(resunicode);
7082 Py_DECREF(restuple);
7083 return resunicode;
7084}
7085
Alexander Belopolsky40018472011-02-26 01:02:56 +00007086static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007087unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007088 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02007089 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007090{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007091 /* input state */
7092 Py_ssize_t pos=0, size;
7093 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007094 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007095 /* pointer into the output */
7096 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007097 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7098 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02007099 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007100 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02007101 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007102 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007103 /* output object */
7104 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007105
Benjamin Petersonbac79492012-01-14 13:34:47 -05007106 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007107 return NULL;
7108 size = PyUnicode_GET_LENGTH(unicode);
7109 kind = PyUnicode_KIND(unicode);
7110 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007111 /* allocate enough for a simple encoding without
7112 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00007113 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00007114 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007115
7116 _PyBytesWriter_Init(&writer);
7117 str = _PyBytesWriter_Alloc(&writer, size);
7118 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00007119 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007120
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007121 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02007122 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007123
Benjamin Peterson29060642009-01-31 22:14:21 +00007124 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007125 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007126 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007127 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007128 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007129 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007130 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007131 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007133 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007134 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007136
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007137 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007139
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007140 /* Only overallocate the buffer if it's not the last write */
7141 writer.overallocate = (collend < size);
7142
Benjamin Peterson29060642009-01-31 22:14:21 +00007143 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007144 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007145 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007146
7147 switch (error_handler) {
7148 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007149 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007150 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007151
7152 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007153 memset(str, '?', collend - collstart);
7154 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007155 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007156 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007157 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007158 break;
Victor Stinner50149202015-09-22 00:26:54 +02007159
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007160 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007161 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007162 writer.min_size -= (collend - collstart);
7163 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007164 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007165 if (str == NULL)
7166 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007167 pos = collend;
7168 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007169
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007170 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007171 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007172 writer.min_size -= (collend - collstart);
7173 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007174 unicode, collstart, collend);
7175 if (str == NULL)
7176 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007177 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 break;
Victor Stinner50149202015-09-22 00:26:54 +02007179
Victor Stinnerc3713e92015-09-29 12:32:13 +02007180 case _Py_ERROR_SURROGATEESCAPE:
7181 for (i = collstart; i < collend; ++i) {
7182 ch = PyUnicode_READ(kind, data, i);
7183 if (ch < 0xdc80 || 0xdcff < ch) {
7184 /* Not a UTF-8b surrogate */
7185 break;
7186 }
7187 *str++ = (char)(ch - 0xdc00);
7188 ++pos;
7189 }
7190 if (i >= collend)
7191 break;
7192 collstart = pos;
7193 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007194 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007195
Benjamin Peterson29060642009-01-31 22:14:21 +00007196 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007197 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7198 encoding, reason, unicode, &exc,
7199 collstart, collend, &newpos);
7200 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007201 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007202
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007203 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007204 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007205
Victor Stinner6bd525b2015-10-09 13:10:05 +02007206 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007207 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007208 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007209 PyBytes_AS_STRING(rep),
7210 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007211 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007212 else {
7213 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007214
Victor Stinner6bd525b2015-10-09 13:10:05 +02007215 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007216 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007217
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007218 if (limit == 256 ?
7219 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7220 !PyUnicode_IS_ASCII(rep))
7221 {
7222 /* Not all characters are smaller than limit */
7223 raise_encode_exception(&exc, encoding, unicode,
7224 collstart, collend, reason);
7225 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007226 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007227 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7228 str = _PyBytesWriter_WriteBytes(&writer, str,
7229 PyUnicode_DATA(rep),
7230 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007231 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007232 if (str == NULL)
7233 goto onError;
7234
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007235 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007236 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007237 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007238
7239 /* If overallocation was disabled, ensure that it was the last
7240 write. Otherwise, we missed an optimization */
7241 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007242 }
7243 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007244
Victor Stinner50149202015-09-22 00:26:54 +02007245 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007246 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007247 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007248
7249 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007250 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007251 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007252 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007253 Py_XDECREF(exc);
7254 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007255}
7256
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007257/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007258PyObject *
7259PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007260 Py_ssize_t size,
7261 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007262{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007263 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007264 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007265 if (unicode == NULL)
7266 return NULL;
7267 result = unicode_encode_ucs1(unicode, errors, 256);
7268 Py_DECREF(unicode);
7269 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007270}
7271
Alexander Belopolsky40018472011-02-26 01:02:56 +00007272PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007273_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274{
7275 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007276 PyErr_BadArgument();
7277 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007278 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007279 if (PyUnicode_READY(unicode) == -1)
7280 return NULL;
7281 /* Fast path: if it is a one-byte string, construct
7282 bytes object directly. */
7283 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7284 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7285 PyUnicode_GET_LENGTH(unicode));
7286 /* Non-Latin-1 characters present. Defer to above function to
7287 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007288 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007289}
7290
7291PyObject*
7292PyUnicode_AsLatin1String(PyObject *unicode)
7293{
7294 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295}
7296
7297/* --- 7-bit ASCII Codec -------------------------------------------------- */
7298
Alexander Belopolsky40018472011-02-26 01:02:56 +00007299PyObject *
7300PyUnicode_DecodeASCII(const char *s,
7301 Py_ssize_t size,
7302 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007304 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007305 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007306 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007307 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007308 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007309
Guido van Rossumd57fd912000-03-10 22:53:23 +00007310 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007311 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007312
Guido van Rossumd57fd912000-03-10 22:53:23 +00007313 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007314 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007315 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007316 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007317
Inada Naoki770847a2019-06-24 12:30:24 +09007318 // Shortcut for simple case
7319 PyObject *u = PyUnicode_New(size, 127);
7320 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007321 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007322 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007323 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007324 if (outpos == size) {
7325 return u;
7326 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007327
Inada Naoki770847a2019-06-24 12:30:24 +09007328 _PyUnicodeWriter writer;
7329 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007330 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007331
Inada Naoki770847a2019-06-24 12:30:24 +09007332 s += outpos;
7333 int kind = writer.kind;
7334 void *data = writer.data;
7335 Py_ssize_t startinpos, endinpos;
7336
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007337 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007338 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007339 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007340 PyUnicode_WRITE(kind, data, writer.pos, c);
7341 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007342 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007343 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007344 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007345
7346 /* byte outsize range 0x00..0x7f: call the error handler */
7347
7348 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007349 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007350
7351 switch (error_handler)
7352 {
7353 case _Py_ERROR_REPLACE:
7354 case _Py_ERROR_SURROGATEESCAPE:
7355 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007356 but we may switch to UCS2 at the first write */
7357 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7358 goto onError;
7359 kind = writer.kind;
7360 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007361
7362 if (error_handler == _Py_ERROR_REPLACE)
7363 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7364 else
7365 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7366 writer.pos++;
7367 ++s;
7368 break;
7369
7370 case _Py_ERROR_IGNORE:
7371 ++s;
7372 break;
7373
7374 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007375 startinpos = s-starts;
7376 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007377 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007378 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007379 "ascii", "ordinal not in range(128)",
7380 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007381 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007382 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007383 kind = writer.kind;
7384 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007385 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007386 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007387 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007388 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007389 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007390
Benjamin Peterson29060642009-01-31 22:14:21 +00007391 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007392 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007393 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007394 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007395 return NULL;
7396}
7397
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007398/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007399PyObject *
7400PyUnicode_EncodeASCII(const Py_UNICODE *p,
7401 Py_ssize_t size,
7402 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007403{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007404 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007405 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007406 if (unicode == NULL)
7407 return NULL;
7408 result = unicode_encode_ucs1(unicode, errors, 128);
7409 Py_DECREF(unicode);
7410 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007411}
7412
Alexander Belopolsky40018472011-02-26 01:02:56 +00007413PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007414_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007415{
7416 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007417 PyErr_BadArgument();
7418 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007420 if (PyUnicode_READY(unicode) == -1)
7421 return NULL;
7422 /* Fast path: if it is an ASCII-only string, construct bytes object
7423 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007424 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007425 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7426 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007427 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007428}
7429
7430PyObject *
7431PyUnicode_AsASCIIString(PyObject *unicode)
7432{
7433 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007434}
7435
Steve Dowercc16be82016-09-08 10:35:16 -07007436#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007437
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007438/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007439
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007440#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007441#define NEED_RETRY
7442#endif
7443
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7448#define DECODING_CHUNK_SIZE (INT_MAX/4)
7449
Victor Stinner3a50e702011-10-18 21:21:00 +02007450#ifndef WC_ERR_INVALID_CHARS
7451# define WC_ERR_INVALID_CHARS 0x0080
7452#endif
7453
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007454static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007455code_page_name(UINT code_page, PyObject **obj)
7456{
7457 *obj = NULL;
7458 if (code_page == CP_ACP)
7459 return "mbcs";
7460 if (code_page == CP_UTF7)
7461 return "CP_UTF7";
7462 if (code_page == CP_UTF8)
7463 return "CP_UTF8";
7464
7465 *obj = PyBytes_FromFormat("cp%u", code_page);
7466 if (*obj == NULL)
7467 return NULL;
7468 return PyBytes_AS_STRING(*obj);
7469}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007470
Victor Stinner3a50e702011-10-18 21:21:00 +02007471static DWORD
7472decode_code_page_flags(UINT code_page)
7473{
7474 if (code_page == CP_UTF7) {
7475 /* The CP_UTF7 decoder only supports flags=0 */
7476 return 0;
7477 }
7478 else
7479 return MB_ERR_INVALID_CHARS;
7480}
7481
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007482/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 * Decode a byte string from a Windows code page into unicode object in strict
7484 * mode.
7485 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007486 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7487 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007488 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007489static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007490decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007491 wchar_t **buf,
7492 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007493 const char *in,
7494 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007495{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007496 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007497 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007498 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007499
7500 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007502 while ((outsize = MultiByteToWideChar(code_page, flags,
7503 in, insize, NULL, 0)) <= 0)
7504 {
7505 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7506 goto error;
7507 }
7508 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7509 flags = 0;
7510 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007511
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007512 /* Extend a wchar_t* buffer */
7513 Py_ssize_t n = *bufsize; /* Get the current length */
7514 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7515 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007516 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007517 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007518
7519 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007520 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7521 if (outsize <= 0)
7522 goto error;
7523 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007524
Victor Stinner3a50e702011-10-18 21:21:00 +02007525error:
7526 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7527 return -2;
7528 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007529 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007530}
7531
Victor Stinner3a50e702011-10-18 21:21:00 +02007532/*
7533 * Decode a byte string from a code page into unicode object with an error
7534 * handler.
7535 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007536 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007537 * UnicodeDecodeError exception and returns -1 on error.
7538 */
7539static int
7540decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007541 wchar_t **buf,
7542 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007543 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007544 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007545{
7546 const char *startin = in;
7547 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007548 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007549 /* Ideally, we should get reason from FormatMessage. This is the Windows
7550 2000 English version of the message. */
7551 const char *reason = "No mapping for the Unicode character exists "
7552 "in the target code page.";
7553 /* each step cannot decode more than 1 character, but a character can be
7554 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007555 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007556 int insize;
7557 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007558 PyObject *errorHandler = NULL;
7559 PyObject *exc = NULL;
7560 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007561 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007562 DWORD err;
7563 int ret = -1;
7564
7565 assert(size > 0);
7566
7567 encoding = code_page_name(code_page, &encoding_obj);
7568 if (encoding == NULL)
7569 return -1;
7570
Victor Stinner7d00cc12014-03-17 23:08:06 +01007571 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007572 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7573 UnicodeDecodeError. */
7574 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7575 if (exc != NULL) {
7576 PyCodec_StrictErrors(exc);
7577 Py_CLEAR(exc);
7578 }
7579 goto error;
7580 }
7581
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007582 /* Extend a wchar_t* buffer */
7583 Py_ssize_t n = *bufsize; /* Get the current length */
7584 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7585 PyErr_NoMemory();
7586 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007587 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007588 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7589 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007590 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007591 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007592
7593 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007594 while (in < endin)
7595 {
7596 /* Decode a character */
7597 insize = 1;
7598 do
7599 {
7600 outsize = MultiByteToWideChar(code_page, flags,
7601 in, insize,
7602 buffer, Py_ARRAY_LENGTH(buffer));
7603 if (outsize > 0)
7604 break;
7605 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007606 if (err == ERROR_INVALID_FLAGS && flags) {
7607 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7608 flags = 0;
7609 continue;
7610 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007611 if (err != ERROR_NO_UNICODE_TRANSLATION
7612 && err != ERROR_INSUFFICIENT_BUFFER)
7613 {
7614 PyErr_SetFromWindowsErr(0);
7615 goto error;
7616 }
7617 insize++;
7618 }
7619 /* 4=maximum length of a UTF-8 sequence */
7620 while (insize <= 4 && (in + insize) <= endin);
7621
7622 if (outsize <= 0) {
7623 Py_ssize_t startinpos, endinpos, outpos;
7624
Victor Stinner7d00cc12014-03-17 23:08:06 +01007625 /* last character in partial decode? */
7626 if (in + insize >= endin && !final)
7627 break;
7628
Victor Stinner3a50e702011-10-18 21:21:00 +02007629 startinpos = in - startin;
7630 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007631 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007632 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007633 errors, &errorHandler,
7634 encoding, reason,
7635 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007636 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007637 {
7638 goto error;
7639 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007640 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007641 }
7642 else {
7643 in += insize;
7644 memcpy(out, buffer, outsize * sizeof(wchar_t));
7645 out += outsize;
7646 }
7647 }
7648
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007649 /* Shrink the buffer */
7650 assert(out - *buf <= *bufsize);
7651 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007652 /* (in - startin) <= size and size is an int */
7653 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007654
7655error:
7656 Py_XDECREF(encoding_obj);
7657 Py_XDECREF(errorHandler);
7658 Py_XDECREF(exc);
7659 return ret;
7660}
7661
Victor Stinner3a50e702011-10-18 21:21:00 +02007662static PyObject *
7663decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007664 const char *s, Py_ssize_t size,
7665 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007666{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007667 wchar_t *buf = NULL;
7668 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007669 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007670
Victor Stinner3a50e702011-10-18 21:21:00 +02007671 if (code_page < 0) {
7672 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7673 return NULL;
7674 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007675 if (size < 0) {
7676 PyErr_BadInternalCall();
7677 return NULL;
7678 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007679
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007680 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007681 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007682
Victor Stinner76a31a62011-11-04 00:05:13 +01007683 do
7684 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007685#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007686 if (size > DECODING_CHUNK_SIZE) {
7687 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007688 final = 0;
7689 done = 0;
7690 }
7691 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007692#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007693 {
7694 chunk_size = (int)size;
7695 final = (consumed == NULL);
7696 done = 1;
7697 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007698
Victor Stinner76a31a62011-11-04 00:05:13 +01007699 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007700 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007701 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007702 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007703 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007704
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007705 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007706 s, chunk_size);
7707 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007708 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007709 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007710 errors, final);
7711 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007712
7713 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007714 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007715 return NULL;
7716 }
7717
7718 if (consumed)
7719 *consumed += converted;
7720
7721 s += converted;
7722 size -= converted;
7723 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007724
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007725 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7726 PyMem_Free(buf);
7727 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007728}
7729
Alexander Belopolsky40018472011-02-26 01:02:56 +00007730PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007731PyUnicode_DecodeCodePageStateful(int code_page,
7732 const char *s,
7733 Py_ssize_t size,
7734 const char *errors,
7735 Py_ssize_t *consumed)
7736{
7737 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7738}
7739
7740PyObject *
7741PyUnicode_DecodeMBCSStateful(const char *s,
7742 Py_ssize_t size,
7743 const char *errors,
7744 Py_ssize_t *consumed)
7745{
7746 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7747}
7748
7749PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007750PyUnicode_DecodeMBCS(const char *s,
7751 Py_ssize_t size,
7752 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007753{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007754 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7755}
7756
Victor Stinner3a50e702011-10-18 21:21:00 +02007757static DWORD
7758encode_code_page_flags(UINT code_page, const char *errors)
7759{
7760 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007761 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007762 }
7763 else if (code_page == CP_UTF7) {
7764 /* CP_UTF7 only supports flags=0 */
7765 return 0;
7766 }
7767 else {
7768 if (errors != NULL && strcmp(errors, "replace") == 0)
7769 return 0;
7770 else
7771 return WC_NO_BEST_FIT_CHARS;
7772 }
7773}
7774
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007775/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007776 * Encode a Unicode string to a Windows code page into a byte string in strict
7777 * mode.
7778 *
7779 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007780 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007781 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007782static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007783encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007784 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007785 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007786{
Victor Stinner554f3f02010-06-16 23:33:54 +00007787 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007788 BOOL *pusedDefaultChar = &usedDefaultChar;
7789 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007790 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007791 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007792 const DWORD flags = encode_code_page_flags(code_page, NULL);
7793 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007794 /* Create a substring so that we can get the UTF-16 representation
7795 of just the slice under consideration. */
7796 PyObject *substring;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007797 int ret = -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007798
Martin v. Löwis3d325192011-11-04 18:23:06 +01007799 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007800
Victor Stinner3a50e702011-10-18 21:21:00 +02007801 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007802 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007803 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007804 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007805
Victor Stinner2fc507f2011-11-04 20:06:39 +01007806 substring = PyUnicode_Substring(unicode, offset, offset+len);
7807 if (substring == NULL)
7808 return -1;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007809#if USE_UNICODE_WCHAR_CACHE
7810_Py_COMP_DIAG_PUSH
7811_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner2fc507f2011-11-04 20:06:39 +01007812 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7813 if (p == NULL) {
7814 Py_DECREF(substring);
7815 return -1;
7816 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007817_Py_COMP_DIAG_POP
7818#else /* USE_UNICODE_WCHAR_CACHE */
7819 p = PyUnicode_AsWideCharString(substring, &size);
7820 Py_CLEAR(substring);
7821 if (p == NULL) {
7822 return -1;
7823 }
7824#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinner9f067f42013-06-05 00:21:31 +02007825 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007826
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007827 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007828 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007829 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007830 NULL, 0,
7831 NULL, pusedDefaultChar);
7832 if (outsize <= 0)
7833 goto error;
7834 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007835 if (pusedDefaultChar && *pusedDefaultChar) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007836 ret = -2;
7837 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007838 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007839
Victor Stinner3a50e702011-10-18 21:21:00 +02007840 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007841 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007842 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007843 if (*outbytes == NULL) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007844 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007845 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007846 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007847 }
7848 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007849 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007850 const Py_ssize_t n = PyBytes_Size(*outbytes);
7851 if (outsize > PY_SSIZE_T_MAX - n) {
7852 PyErr_NoMemory();
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007853 goto done;
Victor Stinner3a50e702011-10-18 21:21:00 +02007854 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007855 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007856 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007857 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007858 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007859 }
7860
7861 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007862 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007863 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007864 out, outsize,
7865 NULL, pusedDefaultChar);
7866 if (outsize <= 0)
7867 goto error;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007868 if (pusedDefaultChar && *pusedDefaultChar) {
7869 ret = -2;
7870 goto done;
7871 }
7872 ret = 0;
7873
7874done:
7875#if USE_UNICODE_WCHAR_CACHE
7876 Py_DECREF(substring);
7877#else /* USE_UNICODE_WCHAR_CACHE */
7878 PyMem_Free(p);
7879#endif /* USE_UNICODE_WCHAR_CACHE */
7880 return ret;
Victor Stinner554f3f02010-06-16 23:33:54 +00007881
Victor Stinner3a50e702011-10-18 21:21:00 +02007882error:
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007883 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7884 ret = -2;
7885 goto done;
7886 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007887 PyErr_SetFromWindowsErr(0);
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007888 goto done;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007889}
7890
Victor Stinner3a50e702011-10-18 21:21:00 +02007891/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007892 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007893 * error handler.
7894 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007895 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007896 * -1 on other error.
7897 */
7898static int
7899encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007900 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007901 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007902{
Victor Stinner3a50e702011-10-18 21:21:00 +02007903 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007904 Py_ssize_t pos = unicode_offset;
7905 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007906 /* Ideally, we should get reason from FormatMessage. This is the Windows
7907 2000 English version of the message. */
7908 const char *reason = "invalid character";
7909 /* 4=maximum length of a UTF-8 sequence */
7910 char buffer[4];
7911 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7912 Py_ssize_t outsize;
7913 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007914 PyObject *errorHandler = NULL;
7915 PyObject *exc = NULL;
7916 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007917 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007918 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007919 PyObject *rep;
7920 int ret = -1;
7921
7922 assert(insize > 0);
7923
7924 encoding = code_page_name(code_page, &encoding_obj);
7925 if (encoding == NULL)
7926 return -1;
7927
7928 if (errors == NULL || strcmp(errors, "strict") == 0) {
7929 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7930 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007931 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007932 if (exc != NULL) {
7933 PyCodec_StrictErrors(exc);
7934 Py_DECREF(exc);
7935 }
7936 Py_XDECREF(encoding_obj);
7937 return -1;
7938 }
7939
7940 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7941 pusedDefaultChar = &usedDefaultChar;
7942 else
7943 pusedDefaultChar = NULL;
7944
7945 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7946 PyErr_NoMemory();
7947 goto error;
7948 }
7949 outsize = insize * Py_ARRAY_LENGTH(buffer);
7950
7951 if (*outbytes == NULL) {
7952 /* Create string object */
7953 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7954 if (*outbytes == NULL)
7955 goto error;
7956 out = PyBytes_AS_STRING(*outbytes);
7957 }
7958 else {
7959 /* Extend string object */
7960 Py_ssize_t n = PyBytes_Size(*outbytes);
7961 if (n > PY_SSIZE_T_MAX - outsize) {
7962 PyErr_NoMemory();
7963 goto error;
7964 }
7965 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7966 goto error;
7967 out = PyBytes_AS_STRING(*outbytes) + n;
7968 }
7969
7970 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007971 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007972 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007973 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7974 wchar_t chars[2];
7975 int charsize;
7976 if (ch < 0x10000) {
7977 chars[0] = (wchar_t)ch;
7978 charsize = 1;
7979 }
7980 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007981 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7982 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007983 charsize = 2;
7984 }
7985
Victor Stinner3a50e702011-10-18 21:21:00 +02007986 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007987 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007988 buffer, Py_ARRAY_LENGTH(buffer),
7989 NULL, pusedDefaultChar);
7990 if (outsize > 0) {
7991 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7992 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007993 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007994 memcpy(out, buffer, outsize);
7995 out += outsize;
7996 continue;
7997 }
7998 }
7999 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8000 PyErr_SetFromWindowsErr(0);
8001 goto error;
8002 }
8003
Victor Stinner3a50e702011-10-18 21:21:00 +02008004 rep = unicode_encode_call_errorhandler(
8005 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01008006 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008007 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02008008 if (rep == NULL)
8009 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008010 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02008011
8012 if (PyBytes_Check(rep)) {
8013 outsize = PyBytes_GET_SIZE(rep);
8014 if (outsize != 1) {
8015 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8016 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8017 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8018 Py_DECREF(rep);
8019 goto error;
8020 }
8021 out = PyBytes_AS_STRING(*outbytes) + offset;
8022 }
8023 memcpy(out, PyBytes_AS_STRING(rep), outsize);
8024 out += outsize;
8025 }
8026 else {
8027 Py_ssize_t i;
8028 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008029 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02008030
Benjamin Petersonbac79492012-01-14 13:34:47 -05008031 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02008032 Py_DECREF(rep);
8033 goto error;
8034 }
8035
8036 outsize = PyUnicode_GET_LENGTH(rep);
8037 if (outsize != 1) {
8038 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8039 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8040 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8041 Py_DECREF(rep);
8042 goto error;
8043 }
8044 out = PyBytes_AS_STRING(*outbytes) + offset;
8045 }
8046 kind = PyUnicode_KIND(rep);
8047 data = PyUnicode_DATA(rep);
8048 for (i=0; i < outsize; i++) {
8049 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8050 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008051 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008052 encoding, unicode,
8053 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02008054 "unable to encode error handler result to ASCII");
8055 Py_DECREF(rep);
8056 goto error;
8057 }
8058 *out = (unsigned char)ch;
8059 out++;
8060 }
8061 }
8062 Py_DECREF(rep);
8063 }
8064 /* write a NUL byte */
8065 *out = 0;
8066 outsize = out - PyBytes_AS_STRING(*outbytes);
8067 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8068 if (_PyBytes_Resize(outbytes, outsize) < 0)
8069 goto error;
8070 ret = 0;
8071
8072error:
8073 Py_XDECREF(encoding_obj);
8074 Py_XDECREF(errorHandler);
8075 Py_XDECREF(exc);
8076 return ret;
8077}
8078
Victor Stinner3a50e702011-10-18 21:21:00 +02008079static PyObject *
8080encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01008081 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02008082 const char *errors)
8083{
Martin v. Löwis3d325192011-11-04 18:23:06 +01008084 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02008085 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01008086 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01008087 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01008088
Victor Stinner29dacf22015-01-26 16:41:32 +01008089 if (!PyUnicode_Check(unicode)) {
8090 PyErr_BadArgument();
8091 return NULL;
8092 }
8093
Benjamin Petersonbac79492012-01-14 13:34:47 -05008094 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01008095 return NULL;
8096 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00008097
Victor Stinner3a50e702011-10-18 21:21:00 +02008098 if (code_page < 0) {
8099 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8100 return NULL;
8101 }
8102
Martin v. Löwis3d325192011-11-04 18:23:06 +01008103 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01008104 return PyBytes_FromStringAndSize(NULL, 0);
8105
Victor Stinner7581cef2011-11-03 22:32:33 +01008106 offset = 0;
8107 do
8108 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008109#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07008110 if (len > DECODING_CHUNK_SIZE) {
8111 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01008112 done = 0;
8113 }
Victor Stinner7581cef2011-11-03 22:32:33 +01008114 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008115#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01008116 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008117 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008118 done = 1;
8119 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01008120
Victor Stinner76a31a62011-11-04 00:05:13 +01008121 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008122 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01008123 errors);
8124 if (ret == -2)
8125 ret = encode_code_page_errors(code_page, &outbytes,
8126 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008127 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01008128 if (ret < 0) {
8129 Py_XDECREF(outbytes);
8130 return NULL;
8131 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008132
Victor Stinner7581cef2011-11-03 22:32:33 +01008133 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008134 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008135 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008136
Victor Stinner3a50e702011-10-18 21:21:00 +02008137 return outbytes;
8138}
8139
8140PyObject *
8141PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8142 Py_ssize_t size,
8143 const char *errors)
8144{
Victor Stinner7581cef2011-11-03 22:32:33 +01008145 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008146 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008147 if (unicode == NULL)
8148 return NULL;
8149 res = encode_code_page(CP_ACP, unicode, errors);
8150 Py_DECREF(unicode);
8151 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008152}
8153
8154PyObject *
8155PyUnicode_EncodeCodePage(int code_page,
8156 PyObject *unicode,
8157 const char *errors)
8158{
Victor Stinner7581cef2011-11-03 22:32:33 +01008159 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008160}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008161
Alexander Belopolsky40018472011-02-26 01:02:56 +00008162PyObject *
8163PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008164{
Victor Stinner7581cef2011-11-03 22:32:33 +01008165 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008166}
8167
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008168#undef NEED_RETRY
8169
Steve Dowercc16be82016-09-08 10:35:16 -07008170#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008171
Guido van Rossumd57fd912000-03-10 22:53:23 +00008172/* --- Character Mapping Codec -------------------------------------------- */
8173
Victor Stinnerfb161b12013-04-18 01:44:27 +02008174static int
8175charmap_decode_string(const char *s,
8176 Py_ssize_t size,
8177 PyObject *mapping,
8178 const char *errors,
8179 _PyUnicodeWriter *writer)
8180{
8181 const char *starts = s;
8182 const char *e;
8183 Py_ssize_t startinpos, endinpos;
8184 PyObject *errorHandler = NULL, *exc = NULL;
8185 Py_ssize_t maplen;
8186 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008187 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008188 Py_UCS4 x;
8189 unsigned char ch;
8190
8191 if (PyUnicode_READY(mapping) == -1)
8192 return -1;
8193
8194 maplen = PyUnicode_GET_LENGTH(mapping);
8195 mapdata = PyUnicode_DATA(mapping);
8196 mapkind = PyUnicode_KIND(mapping);
8197
8198 e = s + size;
8199
8200 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8201 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8202 * is disabled in encoding aliases, latin1 is preferred because
8203 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008204 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008205 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8206 Py_UCS4 maxchar = writer->maxchar;
8207
8208 assert (writer->kind == PyUnicode_1BYTE_KIND);
8209 while (s < e) {
8210 ch = *s;
8211 x = mapdata_ucs1[ch];
8212 if (x > maxchar) {
8213 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8214 goto onError;
8215 maxchar = writer->maxchar;
8216 outdata = (Py_UCS1 *)writer->data;
8217 }
8218 outdata[writer->pos] = x;
8219 writer->pos++;
8220 ++s;
8221 }
8222 return 0;
8223 }
8224
8225 while (s < e) {
8226 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8227 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008228 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008229 if (outkind == PyUnicode_1BYTE_KIND) {
8230 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8231 Py_UCS4 maxchar = writer->maxchar;
8232 while (s < e) {
8233 ch = *s;
8234 x = mapdata_ucs2[ch];
8235 if (x > maxchar)
8236 goto Error;
8237 outdata[writer->pos] = x;
8238 writer->pos++;
8239 ++s;
8240 }
8241 break;
8242 }
8243 else if (outkind == PyUnicode_2BYTE_KIND) {
8244 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8245 while (s < e) {
8246 ch = *s;
8247 x = mapdata_ucs2[ch];
8248 if (x == 0xFFFE)
8249 goto Error;
8250 outdata[writer->pos] = x;
8251 writer->pos++;
8252 ++s;
8253 }
8254 break;
8255 }
8256 }
8257 ch = *s;
8258
8259 if (ch < maplen)
8260 x = PyUnicode_READ(mapkind, mapdata, ch);
8261 else
8262 x = 0xfffe; /* invalid value */
8263Error:
8264 if (x == 0xfffe)
8265 {
8266 /* undefined mapping */
8267 startinpos = s-starts;
8268 endinpos = startinpos+1;
8269 if (unicode_decode_call_errorhandler_writer(
8270 errors, &errorHandler,
8271 "charmap", "character maps to <undefined>",
8272 &starts, &e, &startinpos, &endinpos, &exc, &s,
8273 writer)) {
8274 goto onError;
8275 }
8276 continue;
8277 }
8278
8279 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8280 goto onError;
8281 ++s;
8282 }
8283 Py_XDECREF(errorHandler);
8284 Py_XDECREF(exc);
8285 return 0;
8286
8287onError:
8288 Py_XDECREF(errorHandler);
8289 Py_XDECREF(exc);
8290 return -1;
8291}
8292
8293static int
8294charmap_decode_mapping(const char *s,
8295 Py_ssize_t size,
8296 PyObject *mapping,
8297 const char *errors,
8298 _PyUnicodeWriter *writer)
8299{
8300 const char *starts = s;
8301 const char *e;
8302 Py_ssize_t startinpos, endinpos;
8303 PyObject *errorHandler = NULL, *exc = NULL;
8304 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008305 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008306
8307 e = s + size;
8308
8309 while (s < e) {
8310 ch = *s;
8311
8312 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8313 key = PyLong_FromLong((long)ch);
8314 if (key == NULL)
8315 goto onError;
8316
8317 item = PyObject_GetItem(mapping, key);
8318 Py_DECREF(key);
8319 if (item == NULL) {
8320 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8321 /* No mapping found means: mapping is undefined. */
8322 PyErr_Clear();
8323 goto Undefined;
8324 } else
8325 goto onError;
8326 }
8327
8328 /* Apply mapping */
8329 if (item == Py_None)
8330 goto Undefined;
8331 if (PyLong_Check(item)) {
8332 long value = PyLong_AS_LONG(item);
8333 if (value == 0xFFFE)
8334 goto Undefined;
8335 if (value < 0 || value > MAX_UNICODE) {
8336 PyErr_Format(PyExc_TypeError,
Max Bernstein36353882020-10-17 13:38:21 -07008337 "character mapping must be in range(0x%x)",
Victor Stinnerfb161b12013-04-18 01:44:27 +02008338 (unsigned long)MAX_UNICODE + 1);
8339 goto onError;
8340 }
8341
8342 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8343 goto onError;
8344 }
8345 else if (PyUnicode_Check(item)) {
8346 if (PyUnicode_READY(item) == -1)
8347 goto onError;
8348 if (PyUnicode_GET_LENGTH(item) == 1) {
8349 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8350 if (value == 0xFFFE)
8351 goto Undefined;
8352 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8353 goto onError;
8354 }
8355 else {
8356 writer->overallocate = 1;
8357 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8358 goto onError;
8359 }
8360 }
8361 else {
8362 /* wrong return value */
8363 PyErr_SetString(PyExc_TypeError,
8364 "character mapping must return integer, None or str");
8365 goto onError;
8366 }
8367 Py_CLEAR(item);
8368 ++s;
8369 continue;
8370
8371Undefined:
8372 /* undefined mapping */
8373 Py_CLEAR(item);
8374 startinpos = s-starts;
8375 endinpos = startinpos+1;
8376 if (unicode_decode_call_errorhandler_writer(
8377 errors, &errorHandler,
8378 "charmap", "character maps to <undefined>",
8379 &starts, &e, &startinpos, &endinpos, &exc, &s,
8380 writer)) {
8381 goto onError;
8382 }
8383 }
8384 Py_XDECREF(errorHandler);
8385 Py_XDECREF(exc);
8386 return 0;
8387
8388onError:
8389 Py_XDECREF(item);
8390 Py_XDECREF(errorHandler);
8391 Py_XDECREF(exc);
8392 return -1;
8393}
8394
Alexander Belopolsky40018472011-02-26 01:02:56 +00008395PyObject *
8396PyUnicode_DecodeCharmap(const char *s,
8397 Py_ssize_t size,
8398 PyObject *mapping,
8399 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008400{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008401 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008402
Guido van Rossumd57fd912000-03-10 22:53:23 +00008403 /* Default to Latin-1 */
8404 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008408 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008409 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008410 writer.min_length = size;
8411 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008412 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008413
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008414 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008415 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8416 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008417 }
8418 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008419 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8420 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008421 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008422 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008423
Benjamin Peterson29060642009-01-31 22:14:21 +00008424 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008425 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008426 return NULL;
8427}
8428
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008429/* Charmap encoding: the lookup table */
8430
Alexander Belopolsky40018472011-02-26 01:02:56 +00008431struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008432 PyObject_HEAD
8433 unsigned char level1[32];
8434 int count2, count3;
8435 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008436};
8437
8438static PyObject*
8439encoding_map_size(PyObject *obj, PyObject* args)
8440{
8441 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008442 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008443 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008444}
8445
8446static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008447 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 PyDoc_STR("Return the size (in bytes) of this object") },
8449 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008450};
8451
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008452static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008453 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 "EncodingMap", /*tp_name*/
8455 sizeof(struct encoding_map), /*tp_basicsize*/
8456 0, /*tp_itemsize*/
8457 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008458 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008459 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008460 0, /*tp_getattr*/
8461 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008462 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 0, /*tp_repr*/
8464 0, /*tp_as_number*/
8465 0, /*tp_as_sequence*/
8466 0, /*tp_as_mapping*/
8467 0, /*tp_hash*/
8468 0, /*tp_call*/
8469 0, /*tp_str*/
8470 0, /*tp_getattro*/
8471 0, /*tp_setattro*/
8472 0, /*tp_as_buffer*/
8473 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8474 0, /*tp_doc*/
8475 0, /*tp_traverse*/
8476 0, /*tp_clear*/
8477 0, /*tp_richcompare*/
8478 0, /*tp_weaklistoffset*/
8479 0, /*tp_iter*/
8480 0, /*tp_iternext*/
8481 encoding_map_methods, /*tp_methods*/
8482 0, /*tp_members*/
8483 0, /*tp_getset*/
8484 0, /*tp_base*/
8485 0, /*tp_dict*/
8486 0, /*tp_descr_get*/
8487 0, /*tp_descr_set*/
8488 0, /*tp_dictoffset*/
8489 0, /*tp_init*/
8490 0, /*tp_alloc*/
8491 0, /*tp_new*/
8492 0, /*tp_free*/
8493 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008494};
8495
8496PyObject*
8497PyUnicode_BuildEncodingMap(PyObject* string)
8498{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008499 PyObject *result;
8500 struct encoding_map *mresult;
8501 int i;
8502 int need_dict = 0;
8503 unsigned char level1[32];
8504 unsigned char level2[512];
8505 unsigned char *mlevel1, *mlevel2, *mlevel3;
8506 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008507 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008508 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008509 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008510 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008511
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008512 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008513 PyErr_BadArgument();
8514 return NULL;
8515 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008516 kind = PyUnicode_KIND(string);
8517 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008518 length = PyUnicode_GET_LENGTH(string);
8519 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008520 memset(level1, 0xFF, sizeof level1);
8521 memset(level2, 0xFF, sizeof level2);
8522
8523 /* If there isn't a one-to-one mapping of NULL to \0,
8524 or if there are non-BMP characters, we need to use
8525 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008526 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008527 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008528 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008529 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008530 ch = PyUnicode_READ(kind, data, i);
8531 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008532 need_dict = 1;
8533 break;
8534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008535 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008536 /* unmapped character */
8537 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008538 l1 = ch >> 11;
8539 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008540 if (level1[l1] == 0xFF)
8541 level1[l1] = count2++;
8542 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008543 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008544 }
8545
8546 if (count2 >= 0xFF || count3 >= 0xFF)
8547 need_dict = 1;
8548
8549 if (need_dict) {
8550 PyObject *result = PyDict_New();
8551 PyObject *key, *value;
8552 if (!result)
8553 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008554 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008555 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008556 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008557 if (!key || !value)
8558 goto failed1;
8559 if (PyDict_SetItem(result, key, value) == -1)
8560 goto failed1;
8561 Py_DECREF(key);
8562 Py_DECREF(value);
8563 }
8564 return result;
8565 failed1:
8566 Py_XDECREF(key);
8567 Py_XDECREF(value);
8568 Py_DECREF(result);
8569 return NULL;
8570 }
8571
8572 /* Create a three-level trie */
Victor Stinner32bd68c2020-12-01 10:37:39 +01008573 result = PyObject_Malloc(sizeof(struct encoding_map) +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008574 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008575 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008576 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008577 }
8578
8579 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008580 mresult = (struct encoding_map*)result;
8581 mresult->count2 = count2;
8582 mresult->count3 = count3;
8583 mlevel1 = mresult->level1;
8584 mlevel2 = mresult->level23;
8585 mlevel3 = mresult->level23 + 16*count2;
8586 memcpy(mlevel1, level1, 32);
8587 memset(mlevel2, 0xFF, 16*count2);
8588 memset(mlevel3, 0, 128*count3);
8589 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008590 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008591 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008592 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8593 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008594 /* unmapped character */
8595 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008596 o1 = ch>>11;
8597 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008598 i2 = 16*mlevel1[o1] + o2;
8599 if (mlevel2[i2] == 0xFF)
8600 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008601 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008602 i3 = 128*mlevel2[i2] + o3;
8603 mlevel3[i3] = i;
8604 }
8605 return result;
8606}
8607
8608static int
Victor Stinner22168992011-11-20 17:09:18 +01008609encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008610{
8611 struct encoding_map *map = (struct encoding_map*)mapping;
8612 int l1 = c>>11;
8613 int l2 = (c>>7) & 0xF;
8614 int l3 = c & 0x7F;
8615 int i;
8616
Victor Stinner22168992011-11-20 17:09:18 +01008617 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008618 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008619 if (c == 0)
8620 return 0;
8621 /* level 1*/
8622 i = map->level1[l1];
8623 if (i == 0xFF) {
8624 return -1;
8625 }
8626 /* level 2*/
8627 i = map->level23[16*i+l2];
8628 if (i == 0xFF) {
8629 return -1;
8630 }
8631 /* level 3 */
8632 i = map->level23[16*map->count2 + 128*i + l3];
8633 if (i == 0) {
8634 return -1;
8635 }
8636 return i;
8637}
8638
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008639/* Lookup the character ch in the mapping. If the character
8640 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008641 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008642static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008643charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008644{
Christian Heimes217cfd12007-12-02 14:31:20 +00008645 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008646 PyObject *x;
8647
8648 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008650 x = PyObject_GetItem(mapping, w);
8651 Py_DECREF(w);
8652 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8654 /* No mapping found means: mapping is undefined. */
8655 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008656 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 } else
8658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008660 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008661 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008662 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 long value = PyLong_AS_LONG(x);
8664 if (value < 0 || value > 255) {
8665 PyErr_SetString(PyExc_TypeError,
8666 "character mapping must be in range(256)");
8667 Py_DECREF(x);
8668 return NULL;
8669 }
8670 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008672 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008674 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 /* wrong return value */
8676 PyErr_Format(PyExc_TypeError,
8677 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008678 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008679 Py_DECREF(x);
8680 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008681 }
8682}
8683
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008684static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008685charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008686{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008687 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8688 /* exponentially overallocate to minimize reallocations */
8689 if (requiredsize < 2*outsize)
8690 requiredsize = 2*outsize;
8691 if (_PyBytes_Resize(outobj, requiredsize))
8692 return -1;
8693 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008694}
8695
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* a lookup or resize error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008700 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 space is available. Return a new reference to the object that
8702 was put in the output buffer, or Py_None, if the mapping was undefined
8703 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008704 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008705static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008706charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008707 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008708{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008709 PyObject *rep;
8710 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008711 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712
Andy Lesterdffe4c02020-03-04 07:15:20 -06008713 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008714 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008716 if (res == -1)
8717 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 if (outsize<requiredsize)
8719 if (charmapencode_resize(outobj, outpos, requiredsize))
8720 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008721 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 outstart[(*outpos)++] = (char)res;
8723 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008724 }
8725
8726 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008727 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008729 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008730 Py_DECREF(rep);
8731 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008732 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 if (PyLong_Check(rep)) {
8734 Py_ssize_t requiredsize = *outpos+1;
8735 if (outsize<requiredsize)
8736 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8737 Py_DECREF(rep);
8738 return enc_EXCEPTION;
8739 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008740 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008742 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 else {
8744 const char *repchars = PyBytes_AS_STRING(rep);
8745 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8746 Py_ssize_t requiredsize = *outpos+repsize;
8747 if (outsize<requiredsize)
8748 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8749 Py_DECREF(rep);
8750 return enc_EXCEPTION;
8751 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008752 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 memcpy(outstart + *outpos, repchars, repsize);
8754 *outpos += repsize;
8755 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008756 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008757 Py_DECREF(rep);
8758 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008759}
8760
8761/* handle an error in PyUnicode_EncodeCharmap
8762 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008763static int
8764charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008765 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008766 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008767 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008768 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008769{
8770 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008771 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008772 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008773 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008774 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008775 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008776 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008777 Py_ssize_t collstartpos = *inpos;
8778 Py_ssize_t collendpos = *inpos+1;
8779 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008780 const char *encoding = "charmap";
8781 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008782 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008783 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008784 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785
Benjamin Petersonbac79492012-01-14 13:34:47 -05008786 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008787 return -1;
8788 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008789 /* find all unencodable characters */
8790 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008791 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008792 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008793 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008794 val = encoding_map_lookup(ch, mapping);
8795 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008796 break;
8797 ++collendpos;
8798 continue;
8799 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008800
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008801 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8802 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 if (rep==NULL)
8804 return -1;
8805 else if (rep!=Py_None) {
8806 Py_DECREF(rep);
8807 break;
8808 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008809 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008810 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008811 }
8812 /* cache callback name lookup
8813 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008814 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008815 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008816
8817 switch (*error_handler) {
8818 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008819 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008820 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008821
8822 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008823 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 x = charmapencode_output('?', mapping, res, respos);
8825 if (x==enc_EXCEPTION) {
8826 return -1;
8827 }
8828 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008829 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 return -1;
8831 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008832 }
8833 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008834 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008835 *inpos = collendpos;
8836 break;
Victor Stinner50149202015-09-22 00:26:54 +02008837
8838 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008839 /* generate replacement (temporarily (mis)uses p) */
8840 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 char buffer[2+29+1+1];
8842 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008843 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008844 for (cp = buffer; *cp; ++cp) {
8845 x = charmapencode_output(*cp, mapping, res, respos);
8846 if (x==enc_EXCEPTION)
8847 return -1;
8848 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008849 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008850 return -1;
8851 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008852 }
8853 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008854 *inpos = collendpos;
8855 break;
Victor Stinner50149202015-09-22 00:26:54 +02008856
Benjamin Peterson14339b62009-01-31 16:36:08 +00008857 default:
Victor Stinner50149202015-09-22 00:26:54 +02008858 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008859 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008860 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008861 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008862 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008863 if (PyBytes_Check(repunicode)) {
8864 /* Directly copy bytes result to output. */
8865 Py_ssize_t outsize = PyBytes_Size(*res);
8866 Py_ssize_t requiredsize;
8867 repsize = PyBytes_Size(repunicode);
8868 requiredsize = *respos + repsize;
8869 if (requiredsize > outsize)
8870 /* Make room for all additional bytes. */
8871 if (charmapencode_resize(res, respos, requiredsize)) {
8872 Py_DECREF(repunicode);
8873 return -1;
8874 }
8875 memcpy(PyBytes_AsString(*res) + *respos,
8876 PyBytes_AsString(repunicode), repsize);
8877 *respos += repsize;
8878 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008879 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008880 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008881 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008882 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008883 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008884 Py_DECREF(repunicode);
8885 return -1;
8886 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008887 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008888 data = PyUnicode_DATA(repunicode);
8889 kind = PyUnicode_KIND(repunicode);
8890 for (index = 0; index < repsize; index++) {
8891 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8892 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008894 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008895 return -1;
8896 }
8897 else if (x==enc_FAILED) {
8898 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008899 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008900 return -1;
8901 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008902 }
8903 *inpos = newpos;
8904 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008905 }
8906 return 0;
8907}
8908
Alexander Belopolsky40018472011-02-26 01:02:56 +00008909PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008910_PyUnicode_EncodeCharmap(PyObject *unicode,
8911 PyObject *mapping,
8912 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008913{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008914 /* output object */
8915 PyObject *res = NULL;
8916 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008917 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008918 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008919 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008920 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008921 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008922 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008923 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008924 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008925 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008926
Benjamin Petersonbac79492012-01-14 13:34:47 -05008927 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008928 return NULL;
8929 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008930 data = PyUnicode_DATA(unicode);
8931 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008932
Guido van Rossumd57fd912000-03-10 22:53:23 +00008933 /* Default to Latin-1 */
8934 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008935 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008936
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008937 /* allocate enough for a simple encoding without
8938 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008939 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008940 if (res == NULL)
8941 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008942 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008943 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008944
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008945 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008946 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008947 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008948 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 if (x==enc_EXCEPTION) /* error */
8950 goto onError;
8951 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008952 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008953 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008954 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008955 &res, &respos)) {
8956 goto onError;
8957 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008958 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008959 else
8960 /* done with this character => adjust input position */
8961 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008962 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008964 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008965 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008966 if (_PyBytes_Resize(&res, respos) < 0)
8967 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008968
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008969 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008970 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008971 return res;
8972
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008974 Py_XDECREF(res);
8975 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008976 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977 return NULL;
8978}
8979
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008980/* Deprecated */
8981PyObject *
8982PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8983 Py_ssize_t size,
8984 PyObject *mapping,
8985 const char *errors)
8986{
8987 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008988 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008989 if (unicode == NULL)
8990 return NULL;
8991 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8992 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008993 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008994}
8995
Alexander Belopolsky40018472011-02-26 01:02:56 +00008996PyObject *
8997PyUnicode_AsCharmapString(PyObject *unicode,
8998 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999{
9000 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009001 PyErr_BadArgument();
9002 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009003 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009004 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009005}
9006
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009007/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009008static void
9009make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009010 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009011 Py_ssize_t startpos, Py_ssize_t endpos,
9012 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009014 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009015 *exceptionObject = _PyUnicodeTranslateError_Create(
9016 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009017 }
9018 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009019 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9020 goto onError;
9021 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9022 goto onError;
9023 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9024 goto onError;
9025 return;
9026 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02009027 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009028 }
9029}
9030
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009031/* error handling callback helper:
9032 build arguments, call the callback and check the arguments,
9033 put the result into newpos and return the replacement string, which
9034 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009035static PyObject *
9036unicode_translate_call_errorhandler(const char *errors,
9037 PyObject **errorHandler,
9038 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009040 Py_ssize_t startpos, Py_ssize_t endpos,
9041 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009042{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009043 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009044
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009045 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009046 PyObject *restuple;
9047 PyObject *resunicode;
9048
9049 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009050 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009051 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009052 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009053 }
9054
9055 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009056 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009057 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009058 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009059
Petr Viktorinffd97532020-02-11 17:46:57 +01009060 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009061 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009062 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009063 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009064 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00009065 Py_DECREF(restuple);
9066 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009067 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009068 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00009069 &resunicode, &i_newpos)) {
9070 Py_DECREF(restuple);
9071 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009072 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00009073 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009075 else
9076 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02009078 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009079 Py_DECREF(restuple);
9080 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00009081 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009082 Py_INCREF(resunicode);
9083 Py_DECREF(restuple);
9084 return resunicode;
9085}
9086
9087/* Lookup the character ch in the mapping and put the result in result,
9088 which must be decrefed by the caller.
9089 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009090static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009092{
Christian Heimes217cfd12007-12-02 14:31:20 +00009093 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009094 PyObject *x;
9095
9096 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009097 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009098 x = PyObject_GetItem(mapping, w);
9099 Py_DECREF(w);
9100 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009101 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9102 /* No mapping found means: use 1:1 mapping. */
9103 PyErr_Clear();
9104 *result = NULL;
9105 return 0;
9106 } else
9107 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009108 }
9109 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009110 *result = x;
9111 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009112 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009113 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009114 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009115 if (value < 0 || value > MAX_UNICODE) {
9116 PyErr_Format(PyExc_ValueError,
9117 "character mapping must be in range(0x%x)",
9118 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00009119 Py_DECREF(x);
9120 return -1;
9121 }
9122 *result = x;
9123 return 0;
9124 }
9125 else if (PyUnicode_Check(x)) {
9126 *result = x;
9127 return 0;
9128 }
9129 else {
9130 /* wrong return value */
9131 PyErr_SetString(PyExc_TypeError,
9132 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009133 Py_DECREF(x);
9134 return -1;
9135 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009136}
Victor Stinner1194ea02014-04-04 19:37:40 +02009137
9138/* lookup the character, write the result into the writer.
9139 Return 1 if the result was written into the writer, return 0 if the mapping
9140 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009141static int
Victor Stinner1194ea02014-04-04 19:37:40 +02009142charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9143 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009144{
Victor Stinner1194ea02014-04-04 19:37:40 +02009145 PyObject *item;
9146
9147 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009148 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009149
9150 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009151 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009152 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009153 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009154 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009155 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009156 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009157
9158 if (item == Py_None) {
9159 Py_DECREF(item);
9160 return 0;
9161 }
9162
9163 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009164 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9165 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9166 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009167 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9168 Py_DECREF(item);
9169 return -1;
9170 }
9171 Py_DECREF(item);
9172 return 1;
9173 }
9174
9175 if (!PyUnicode_Check(item)) {
9176 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009177 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009178 }
9179
9180 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9181 Py_DECREF(item);
9182 return -1;
9183 }
9184
9185 Py_DECREF(item);
9186 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009187}
9188
Victor Stinner89a76ab2014-04-05 11:44:04 +02009189static int
9190unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9191 Py_UCS1 *translate)
9192{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009193 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009194 int ret = 0;
9195
Victor Stinner89a76ab2014-04-05 11:44:04 +02009196 if (charmaptranslate_lookup(ch, mapping, &item)) {
9197 return -1;
9198 }
9199
9200 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009201 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009202 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009203 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009204 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009205 /* not found => default to 1:1 mapping */
9206 translate[ch] = ch;
9207 return 1;
9208 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009209 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009210 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009211 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9212 used it */
9213 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009214 /* invalid character or character outside ASCII:
9215 skip the fast translate */
9216 goto exit;
9217 }
9218 translate[ch] = (Py_UCS1)replace;
9219 }
9220 else if (PyUnicode_Check(item)) {
9221 Py_UCS4 replace;
9222
9223 if (PyUnicode_READY(item) == -1) {
9224 Py_DECREF(item);
9225 return -1;
9226 }
9227 if (PyUnicode_GET_LENGTH(item) != 1)
9228 goto exit;
9229
9230 replace = PyUnicode_READ_CHAR(item, 0);
9231 if (replace > 127)
9232 goto exit;
9233 translate[ch] = (Py_UCS1)replace;
9234 }
9235 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009236 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009237 goto exit;
9238 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009239 ret = 1;
9240
Benjamin Peterson1365de72014-04-07 20:15:41 -04009241 exit:
9242 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009243 return ret;
9244}
9245
9246/* Fast path for ascii => ascii translation. Return 1 if the whole string
9247 was translated into writer, return 0 if the input string was partially
9248 translated into writer, raise an exception and return -1 on error. */
9249static int
9250unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009251 _PyUnicodeWriter *writer, int ignore,
9252 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009253{
Victor Stinner872b2912014-04-05 14:27:07 +02009254 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009255 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009256 const Py_UCS1 *in, *end;
9257 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009258 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009259
Victor Stinner89a76ab2014-04-05 11:44:04 +02009260 len = PyUnicode_GET_LENGTH(input);
9261
Victor Stinner872b2912014-04-05 14:27:07 +02009262 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009263
9264 in = PyUnicode_1BYTE_DATA(input);
9265 end = in + len;
9266
9267 assert(PyUnicode_IS_ASCII(writer->buffer));
9268 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9269 out = PyUnicode_1BYTE_DATA(writer->buffer);
9270
Victor Stinner872b2912014-04-05 14:27:07 +02009271 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009272 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009273 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009274 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009275 int translate = unicode_fast_translate_lookup(mapping, ch,
9276 ascii_table);
9277 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009278 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009279 if (translate == 0)
9280 goto exit;
9281 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009282 }
Victor Stinner872b2912014-04-05 14:27:07 +02009283 if (ch2 == 0xfe) {
9284 if (ignore)
9285 continue;
9286 goto exit;
9287 }
9288 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009289 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009290 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009291 }
Victor Stinner872b2912014-04-05 14:27:07 +02009292 res = 1;
9293
9294exit:
9295 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009296 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009297 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009298}
9299
Victor Stinner3222da22015-10-01 22:07:32 +02009300static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009301_PyUnicode_TranslateCharmap(PyObject *input,
9302 PyObject *mapping,
9303 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009304{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009305 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009306 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009307 Py_ssize_t size, i;
9308 int kind;
9309 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009310 _PyUnicodeWriter writer;
9311 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009312 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009313 PyObject *errorHandler = NULL;
9314 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009315 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009316 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009317
Guido van Rossumd57fd912000-03-10 22:53:23 +00009318 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009319 PyErr_BadArgument();
9320 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009321 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009322
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009323 if (PyUnicode_READY(input) == -1)
9324 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009325 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009326 kind = PyUnicode_KIND(input);
9327 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009328
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009329 if (size == 0)
9330 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009331
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009332 /* allocate enough for a simple 1:1 translation without
9333 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009334 _PyUnicodeWriter_Init(&writer);
9335 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009336 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337
Victor Stinner872b2912014-04-05 14:27:07 +02009338 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9339
Victor Stinner33798672016-03-01 21:59:58 +01009340 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009341 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009342 if (PyUnicode_IS_ASCII(input)) {
9343 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9344 if (res < 0) {
9345 _PyUnicodeWriter_Dealloc(&writer);
9346 return NULL;
9347 }
9348 if (res == 1)
9349 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009350 }
Victor Stinner33798672016-03-01 21:59:58 +01009351 else {
9352 i = 0;
9353 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009354
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009356 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009357 int translate;
9358 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9359 Py_ssize_t newpos;
9360 /* startpos for collecting untranslatable chars */
9361 Py_ssize_t collstart;
9362 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009363 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009364
Victor Stinner1194ea02014-04-04 19:37:40 +02009365 ch = PyUnicode_READ(kind, data, i);
9366 translate = charmaptranslate_output(ch, mapping, &writer);
9367 if (translate < 0)
9368 goto onError;
9369
9370 if (translate != 0) {
9371 /* it worked => adjust input pointer */
9372 ++i;
9373 continue;
9374 }
9375
9376 /* untranslatable character */
9377 collstart = i;
9378 collend = i+1;
9379
9380 /* find all untranslatable characters */
9381 while (collend < size) {
9382 PyObject *x;
9383 ch = PyUnicode_READ(kind, data, collend);
9384 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009385 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009386 Py_XDECREF(x);
9387 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009388 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009389 ++collend;
9390 }
9391
9392 if (ignore) {
9393 i = collend;
9394 }
9395 else {
9396 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9397 reason, input, &exc,
9398 collstart, collend, &newpos);
9399 if (repunicode == NULL)
9400 goto onError;
9401 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009402 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009403 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009404 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009405 Py_DECREF(repunicode);
9406 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009407 }
9408 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009409 Py_XDECREF(exc);
9410 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009411 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009412
Benjamin Peterson29060642009-01-31 22:14:21 +00009413 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009414 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009415 Py_XDECREF(exc);
9416 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417 return NULL;
9418}
9419
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009420/* Deprecated. Use PyUnicode_Translate instead. */
9421PyObject *
9422PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9423 Py_ssize_t size,
9424 PyObject *mapping,
9425 const char *errors)
9426{
Christian Heimes5f520f42012-09-11 14:03:25 +02009427 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009428 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009429 if (!unicode)
9430 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009431 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9432 Py_DECREF(unicode);
9433 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009434}
9435
Alexander Belopolsky40018472011-02-26 01:02:56 +00009436PyObject *
9437PyUnicode_Translate(PyObject *str,
9438 PyObject *mapping,
9439 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009440{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009441 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009442 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009443 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009444}
Tim Petersced69f82003-09-16 20:30:58 +00009445
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009446PyObject *
9447_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9448{
9449 if (!PyUnicode_Check(unicode)) {
9450 PyErr_BadInternalCall();
9451 return NULL;
9452 }
9453 if (PyUnicode_READY(unicode) == -1)
9454 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009455 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009456 /* If the string is already ASCII, just return the same string */
9457 Py_INCREF(unicode);
9458 return unicode;
9459 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009460
9461 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9462 PyObject *result = PyUnicode_New(len, 127);
9463 if (result == NULL) {
9464 return NULL;
9465 }
9466
9467 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9468 int kind = PyUnicode_KIND(unicode);
9469 const void *data = PyUnicode_DATA(unicode);
9470 Py_ssize_t i;
9471 for (i = 0; i < len; ++i) {
9472 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9473 if (ch < 127) {
9474 out[i] = ch;
9475 }
9476 else if (Py_UNICODE_ISSPACE(ch)) {
9477 out[i] = ' ';
9478 }
9479 else {
9480 int decimal = Py_UNICODE_TODECIMAL(ch);
9481 if (decimal < 0) {
9482 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009483 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009484 _PyUnicode_LENGTH(result) = i + 1;
9485 break;
9486 }
9487 out[i] = '0' + decimal;
9488 }
9489 }
9490
INADA Naoki16dfca42018-07-14 12:06:43 +09009491 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009492 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493}
9494
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009495PyObject *
9496PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9497 Py_ssize_t length)
9498{
Victor Stinnerf0124502011-11-21 23:12:56 +01009499 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009500 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009501 Py_UCS4 maxchar;
9502 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009503 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009504
Victor Stinner99d7ad02012-02-22 13:37:39 +01009505 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009506 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009507 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009508 if (ch > 127) {
9509 int decimal = Py_UNICODE_TODECIMAL(ch);
9510 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009511 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009512 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009513 }
9514 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009515
9516 /* Copy to a new string */
9517 decimal = PyUnicode_New(length, maxchar);
9518 if (decimal == NULL)
9519 return decimal;
9520 kind = PyUnicode_KIND(decimal);
9521 data = PyUnicode_DATA(decimal);
9522 /* Iterate over code points */
9523 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009524 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009525 if (ch > 127) {
9526 int decimal = Py_UNICODE_TODECIMAL(ch);
9527 if (decimal >= 0)
9528 ch = '0' + decimal;
9529 }
9530 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009531 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009532 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009533}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009534/* --- Decimal Encoder ---------------------------------------------------- */
9535
Alexander Belopolsky40018472011-02-26 01:02:56 +00009536int
9537PyUnicode_EncodeDecimal(Py_UNICODE *s,
9538 Py_ssize_t length,
9539 char *output,
9540 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009541{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009542 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009543 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009544 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009545 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009546
9547 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009548 PyErr_BadArgument();
9549 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009550 }
9551
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009552 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009553 if (unicode == NULL)
9554 return -1;
9555
Victor Stinner42bf7752011-11-21 22:52:58 +01009556 kind = PyUnicode_KIND(unicode);
9557 data = PyUnicode_DATA(unicode);
9558
Victor Stinnerb84d7232011-11-22 01:50:07 +01009559 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009560 PyObject *exc;
9561 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009562 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009563 Py_ssize_t startpos;
9564
9565 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009566
Benjamin Peterson29060642009-01-31 22:14:21 +00009567 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009568 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009569 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009570 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009571 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009572 decimal = Py_UNICODE_TODECIMAL(ch);
9573 if (decimal >= 0) {
9574 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009575 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009576 continue;
9577 }
9578 if (0 < ch && ch < 256) {
9579 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009580 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009581 continue;
9582 }
Victor Stinner6345be92011-11-25 20:09:01 +01009583
Victor Stinner42bf7752011-11-21 22:52:58 +01009584 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009585 exc = NULL;
9586 raise_encode_exception(&exc, "decimal", unicode,
9587 startpos, startpos+1,
9588 "invalid decimal Unicode string");
9589 Py_XDECREF(exc);
9590 Py_DECREF(unicode);
9591 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009592 }
9593 /* 0-terminate the output string */
9594 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009595 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009596 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009597}
9598
Guido van Rossumd57fd912000-03-10 22:53:23 +00009599/* --- Helpers ------------------------------------------------------------ */
9600
/* Helper macro to fix up start/end slice values.

   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and floored at 0.  `start` is NOT clamped to `len`: callers in
   this file compare `end - start` against the length they need.

   `start` and `end` must be modifiable lvalues.  Every argument is
   evaluated more than once, so arguments must be free of side effects.

   The body is wrapped in do { } while (0) so that the macro expands to
   exactly one statement and is safe inside an unbraced if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9615
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009616static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009617any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009619 Py_ssize_t end,
9620 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009621{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009622 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009623 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 Py_ssize_t len1, len2, result;
9625
9626 kind1 = PyUnicode_KIND(s1);
9627 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009628 if (kind1 < kind2)
9629 return -1;
9630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631 len1 = PyUnicode_GET_LENGTH(s1);
9632 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009633 ADJUST_INDICES(start, end, len1);
9634 if (end - start < len2)
9635 return -1;
9636
9637 buf1 = PyUnicode_DATA(s1);
9638 buf2 = PyUnicode_DATA(s2);
9639 if (len2 == 1) {
9640 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9641 result = findchar((const char *)buf1 + kind1*start,
9642 kind1, end - start, ch, direction);
9643 if (result == -1)
9644 return -1;
9645 else
9646 return start + result;
9647 }
9648
9649 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009650 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009651 if (!buf2)
9652 return -2;
9653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654
Victor Stinner794d5672011-10-10 03:21:36 +02009655 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009656 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009657 case PyUnicode_1BYTE_KIND:
9658 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9659 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9660 else
9661 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9662 break;
9663 case PyUnicode_2BYTE_KIND:
9664 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9665 break;
9666 case PyUnicode_4BYTE_KIND:
9667 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9668 break;
9669 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009670 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009671 }
9672 }
9673 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009674 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009675 case PyUnicode_1BYTE_KIND:
9676 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9677 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9678 else
9679 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9680 break;
9681 case PyUnicode_2BYTE_KIND:
9682 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9683 break;
9684 case PyUnicode_4BYTE_KIND:
9685 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9686 break;
9687 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009688 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009690 }
9691
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009692 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009693 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009694 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009695
9696 return result;
9697}
9698
Victor Stinner59423e32018-11-26 13:40:01 +01009699/* _PyUnicode_InsertThousandsGrouping() helper functions */
9700#include "stringlib/localeutil.h"
9701
9702/**
9703 * InsertThousandsGrouping:
9704 * @writer: Unicode writer.
9705 * @n_buffer: Number of characters in @buffer.
9706 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9707 * @d_pos: Start of digits string.
9708 * @n_digits: The number of digits in the string, in which we want
9709 * to put the grouping chars.
9710 * @min_width: The minimum width of the digits in the output string.
9711 * Output will be zero-padded on the left to fill.
9712 * @grouping: see definition in localeconv().
9713 * @thousands_sep: see definition in localeconv().
9714 *
9715 * There are 2 modes: counting and filling. If @writer is NULL,
9716 * we are in counting mode, else filling mode.
9717 * If counting, the required buffer size is returned.
9718 * If filling, we know the buffer will be large enough, so we don't
9719 * need to pass in the buffer size.
9720 * Inserts thousand grouping characters (as defined by grouping and
9721 * thousands_sep) into @writer.
9722 *
9723 * Return value: -1 on error, number of characters otherwise.
9724 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009725Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009726_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009727 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009728 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009729 PyObject *digits,
9730 Py_ssize_t d_pos,
9731 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009732 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009733 const char *grouping,
9734 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009735 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736{
Xtreak3f7983a2019-01-07 20:39:14 +05309737 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009738 if (writer) {
9739 assert(digits != NULL);
9740 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009741 }
9742 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009743 assert(digits == NULL);
9744 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009745 }
Victor Stinner59423e32018-11-26 13:40:01 +01009746 assert(0 <= d_pos);
9747 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009748 assert(grouping != NULL);
9749
9750 if (digits != NULL) {
9751 if (PyUnicode_READY(digits) == -1) {
9752 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009753 }
Victor Stinner59423e32018-11-26 13:40:01 +01009754 }
9755 if (PyUnicode_READY(thousands_sep) == -1) {
9756 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009757 }
9758
Victor Stinner59423e32018-11-26 13:40:01 +01009759 Py_ssize_t count = 0;
9760 Py_ssize_t n_zeros;
9761 int loop_broken = 0;
9762 int use_separator = 0; /* First time through, don't append the
9763 separator. They only go between
9764 groups. */
9765 Py_ssize_t buffer_pos;
9766 Py_ssize_t digits_pos;
9767 Py_ssize_t len;
9768 Py_ssize_t n_chars;
9769 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9770 be looked at */
9771 /* A generator that returns all of the grouping widths, until it
9772 returns 0. */
9773 GroupGenerator groupgen;
9774 GroupGenerator_init(&groupgen, grouping);
9775 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9776
9777 /* if digits are not grouped, thousands separator
9778 should be an empty string */
9779 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9780
9781 digits_pos = d_pos + n_digits;
9782 if (writer) {
9783 buffer_pos = writer->pos + n_buffer;
9784 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9785 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009786 }
Victor Stinner59423e32018-11-26 13:40:01 +01009787 else {
9788 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009789 }
Victor Stinner59423e32018-11-26 13:40:01 +01009790
9791 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009792 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009793 }
Victor Stinner59423e32018-11-26 13:40:01 +01009794
9795 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9796 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9797 n_zeros = Py_MAX(0, len - remaining);
9798 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9799
9800 /* Use n_zero zero's and n_chars chars */
9801
9802 /* Count only, don't do anything. */
9803 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9804
9805 /* Copy into the writer. */
9806 InsertThousandsGrouping_fill(writer, &buffer_pos,
9807 digits, &digits_pos,
9808 n_chars, n_zeros,
9809 use_separator ? thousands_sep : NULL,
9810 thousands_sep_len, maxchar);
9811
9812 /* Use a separator next time. */
9813 use_separator = 1;
9814
9815 remaining -= n_chars;
9816 min_width -= len;
9817
9818 if (remaining <= 0 && min_width <= 0) {
9819 loop_broken = 1;
9820 break;
9821 }
9822 min_width -= thousands_sep_len;
9823 }
9824 if (!loop_broken) {
9825 /* We left the loop without using a break statement. */
9826
9827 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9828 n_zeros = Py_MAX(0, len - remaining);
9829 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9830
9831 /* Use n_zero zero's and n_chars chars */
9832 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9833
9834 /* Copy into the writer. */
9835 InsertThousandsGrouping_fill(writer, &buffer_pos,
9836 digits, &digits_pos,
9837 n_chars, n_zeros,
9838 use_separator ? thousands_sep : NULL,
9839 thousands_sep_len, maxchar);
9840 }
9841 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009842}
9843
9844
Alexander Belopolsky40018472011-02-26 01:02:56 +00009845Py_ssize_t
9846PyUnicode_Count(PyObject *str,
9847 PyObject *substr,
9848 Py_ssize_t start,
9849 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009851 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009852 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009853 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009854 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009855
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009856 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009857 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009858
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009859 kind1 = PyUnicode_KIND(str);
9860 kind2 = PyUnicode_KIND(substr);
9861 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009862 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009863
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009864 len1 = PyUnicode_GET_LENGTH(str);
9865 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009867 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009868 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009869
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009870 buf1 = PyUnicode_DATA(str);
9871 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009872 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009873 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009874 if (!buf2)
9875 goto onError;
9876 }
9877
9878 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009879 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009880 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009881 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009882 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009883 buf2, len2, PY_SSIZE_T_MAX
9884 );
9885 else
9886 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009887 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009888 buf2, len2, PY_SSIZE_T_MAX
9889 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009890 break;
9891 case PyUnicode_2BYTE_KIND:
9892 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009893 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009894 buf2, len2, PY_SSIZE_T_MAX
9895 );
9896 break;
9897 case PyUnicode_4BYTE_KIND:
9898 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009899 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009900 buf2, len2, PY_SSIZE_T_MAX
9901 );
9902 break;
9903 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009904 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009905 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009906
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009907 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009908 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009909 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009910
Guido van Rossumd57fd912000-03-10 22:53:23 +00009911 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009912 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009913 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9914 if (kind2 != kind1)
9915 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009916 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009917}
9918
Alexander Belopolsky40018472011-02-26 01:02:56 +00009919Py_ssize_t
9920PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009921 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009922 Py_ssize_t start,
9923 Py_ssize_t end,
9924 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009925{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009926 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009927 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009928
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009929 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930}
9931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009932Py_ssize_t
9933PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9934 Py_ssize_t start, Py_ssize_t end,
9935 int direction)
9936{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009937 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009938 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009939 if (PyUnicode_READY(str) == -1)
9940 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009941 len = PyUnicode_GET_LENGTH(str);
9942 ADJUST_INDICES(start, end, len);
9943 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009944 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009945 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009946 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9947 kind, end-start, ch, direction);
9948 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009949 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009950 else
9951 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009952}
9953
Alexander Belopolsky40018472011-02-26 01:02:56 +00009954static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009955tailmatch(PyObject *self,
9956 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009957 Py_ssize_t start,
9958 Py_ssize_t end,
9959 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009960{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009961 int kind_self;
9962 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009963 const void *data_self;
9964 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009965 Py_ssize_t offset;
9966 Py_ssize_t i;
9967 Py_ssize_t end_sub;
9968
9969 if (PyUnicode_READY(self) == -1 ||
9970 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009971 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009972
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9974 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009975 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009976 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009977
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009978 if (PyUnicode_GET_LENGTH(substring) == 0)
9979 return 1;
9980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009981 kind_self = PyUnicode_KIND(self);
9982 data_self = PyUnicode_DATA(self);
9983 kind_sub = PyUnicode_KIND(substring);
9984 data_sub = PyUnicode_DATA(substring);
9985 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9986
9987 if (direction > 0)
9988 offset = end;
9989 else
9990 offset = start;
9991
9992 if (PyUnicode_READ(kind_self, data_self, offset) ==
9993 PyUnicode_READ(kind_sub, data_sub, 0) &&
9994 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9995 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9996 /* If both are of the same kind, memcmp is sufficient */
9997 if (kind_self == kind_sub) {
9998 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009999 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010000 data_sub,
10001 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010002 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010003 }
Martin Pantere26da7c2016-06-02 10:07:09 +000010004 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010005 else {
10006 /* We do not need to compare 0 and len(substring)-1 because
10007 the if statement above ensured already that they are equal
10008 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010009 for (i = 1; i < end_sub; ++i) {
10010 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
10011 PyUnicode_READ(kind_sub, data_sub, i))
10012 return 0;
10013 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010014 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010015 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010016 }
10017
10018 return 0;
10019}
10020
Alexander Belopolsky40018472011-02-26 01:02:56 +000010021Py_ssize_t
10022PyUnicode_Tailmatch(PyObject *str,
10023 PyObject *substr,
10024 Py_ssize_t start,
10025 Py_ssize_t end,
10026 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010027{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010028 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010029 return -1;
Tim Petersced69f82003-09-16 20:30:58 +000010030
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010031 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010032}
10033
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010034static PyObject *
10035ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010037 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010038 const char *data = PyUnicode_DATA(self);
10039 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010040 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +000010041
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010042 res = PyUnicode_New(len, 127);
10043 if (res == NULL)
10044 return NULL;
10045 resdata = PyUnicode_DATA(res);
10046 if (lower)
10047 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010048 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010049 _Py_bytes_upper(resdata, data, len);
10050 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010051}
10052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010054handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010055{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010056 Py_ssize_t j;
10057 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010010058 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010059 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +000010060
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010061 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10062
10063 where ! is a negation and \p{xxx} is a character with property xxx.
10064 */
10065 for (j = i - 1; j >= 0; j--) {
10066 c = PyUnicode_READ(kind, data, j);
10067 if (!_PyUnicode_IsCaseIgnorable(c))
10068 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010069 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010070 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10071 if (final_sigma) {
10072 for (j = i + 1; j < length; j++) {
10073 c = PyUnicode_READ(kind, data, j);
10074 if (!_PyUnicode_IsCaseIgnorable(c))
10075 break;
10076 }
10077 final_sigma = j == length || !_PyUnicode_IsCased(c);
10078 }
10079 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010080}
10081
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010082static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010083lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010084 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010085{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010086 /* Obscure special case. */
10087 if (c == 0x3A3) {
10088 mapped[0] = handle_capital_sigma(kind, data, length, i);
10089 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010090 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010091 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010092}
10093
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010094static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010095do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010096{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010097 Py_ssize_t i, k = 0;
10098 int n_res, j;
10099 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +000010100
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010101 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +010010102 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010103 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010104 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010105 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010107 for (i = 1; i < length; i++) {
10108 c = PyUnicode_READ(kind, data, i);
10109 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10110 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010111 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010112 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010113 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010114 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010115 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010116}
10117
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010118static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010119do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010120 Py_ssize_t i, k = 0;
10121
10122 for (i = 0; i < length; i++) {
10123 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10124 int n_res, j;
10125 if (Py_UNICODE_ISUPPER(c)) {
10126 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10127 }
10128 else if (Py_UNICODE_ISLOWER(c)) {
10129 n_res = _PyUnicode_ToUpperFull(c, mapped);
10130 }
10131 else {
10132 n_res = 1;
10133 mapped[0] = c;
10134 }
10135 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010136 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010137 res[k++] = mapped[j];
10138 }
10139 }
10140 return k;
10141}
10142
10143static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010144do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010145 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010146{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010147 Py_ssize_t i, k = 0;
10148
10149 for (i = 0; i < length; i++) {
10150 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10151 int n_res, j;
10152 if (lower)
10153 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10154 else
10155 n_res = _PyUnicode_ToUpperFull(c, mapped);
10156 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010157 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010158 res[k++] = mapped[j];
10159 }
10160 }
10161 return k;
10162}
10163
10164static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010165do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010166{
10167 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10168}
10169
10170static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010171do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010172{
10173 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10174}
10175
Benjamin Petersone51757f2012-01-12 21:10:29 -050010176static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010177do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010178{
10179 Py_ssize_t i, k = 0;
10180
10181 for (i = 0; i < length; i++) {
10182 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10183 Py_UCS4 mapped[3];
10184 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10185 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010186 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010187 res[k++] = mapped[j];
10188 }
10189 }
10190 return k;
10191}
10192
10193static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010194do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010195{
10196 Py_ssize_t i, k = 0;
10197 int previous_is_cased;
10198
10199 previous_is_cased = 0;
10200 for (i = 0; i < length; i++) {
10201 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10202 Py_UCS4 mapped[3];
10203 int n_res, j;
10204
10205 if (previous_is_cased)
10206 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10207 else
10208 n_res = _PyUnicode_ToTitleFull(c, mapped);
10209
10210 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010211 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010212 res[k++] = mapped[j];
10213 }
10214
10215 previous_is_cased = _PyUnicode_IsCased(c);
10216 }
10217 return k;
10218}
10219
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010220static PyObject *
10221case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010222 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010223{
10224 PyObject *res = NULL;
10225 Py_ssize_t length, newlength = 0;
10226 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010227 const void *data;
10228 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010229 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10230
Benjamin Petersoneea48462012-01-16 14:28:50 -050010231 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010232
10233 kind = PyUnicode_KIND(self);
10234 data = PyUnicode_DATA(self);
10235 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010236 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010237 PyErr_SetString(PyExc_OverflowError, "string is too long");
10238 return NULL;
10239 }
Victor Stinner00d7abd2020-12-01 09:56:42 +010010240 tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010241 if (tmp == NULL)
10242 return PyErr_NoMemory();
10243 newlength = perform(kind, data, length, tmp, &maxchar);
10244 res = PyUnicode_New(newlength, maxchar);
10245 if (res == NULL)
10246 goto leave;
10247 tmpend = tmp + newlength;
10248 outdata = PyUnicode_DATA(res);
10249 outkind = PyUnicode_KIND(res);
10250 switch (outkind) {
10251 case PyUnicode_1BYTE_KIND:
10252 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10253 break;
10254 case PyUnicode_2BYTE_KIND:
10255 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10256 break;
10257 case PyUnicode_4BYTE_KIND:
10258 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10259 break;
10260 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010261 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010262 }
10263 leave:
Victor Stinner00d7abd2020-12-01 09:56:42 +010010264 PyMem_Free(tmp);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010265 return res;
10266}
10267
Tim Peters8ce9f162004-08-27 01:49:32 +000010268PyObject *
10269PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010270{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010271 PyObject *res;
10272 PyObject *fseq;
10273 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010274 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010275
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010276 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010277 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010278 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010279 }
10280
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010281 /* NOTE: the following code can't call back into Python code,
10282 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010283 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010284
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010285 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010286 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010287 res = _PyUnicode_JoinArray(separator, items, seqlen);
10288 Py_DECREF(fseq);
10289 return res;
10290}
10291
10292PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010293_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010294{
10295 PyObject *res = NULL; /* the result */
10296 PyObject *sep = NULL;
10297 Py_ssize_t seplen;
10298 PyObject *item;
10299 Py_ssize_t sz, i, res_offset;
10300 Py_UCS4 maxchar;
10301 Py_UCS4 item_maxchar;
10302 int use_memcpy;
10303 unsigned char *res_data = NULL, *sep_data = NULL;
10304 PyObject *last_obj;
10305 unsigned int kind = 0;
10306
Tim Peters05eba1f2004-08-27 21:32:02 +000010307 /* If empty sequence, return u"". */
10308 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010309 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010310 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010311
Tim Peters05eba1f2004-08-27 21:32:02 +000010312 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010313 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010314 if (seqlen == 1) {
10315 if (PyUnicode_CheckExact(items[0])) {
10316 res = items[0];
10317 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010318 return res;
10319 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010320 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010321 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010322 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010323 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010324 /* Set up sep and seplen */
10325 if (separator == NULL) {
10326 /* fall back to a blank space separator */
10327 sep = PyUnicode_FromOrdinal(' ');
10328 if (!sep)
10329 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010330 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010331 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010332 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010333 else {
10334 if (!PyUnicode_Check(separator)) {
10335 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010336 "separator: expected str instance,"
10337 " %.80s found",
10338 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010339 goto onError;
10340 }
10341 if (PyUnicode_READY(separator))
10342 goto onError;
10343 sep = separator;
10344 seplen = PyUnicode_GET_LENGTH(separator);
10345 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10346 /* inc refcount to keep this code path symmetric with the
10347 above case of a blank separator */
10348 Py_INCREF(sep);
10349 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010350 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010351 }
10352
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010353 /* There are at least two things to join, or else we have a subclass
10354 * of str in the sequence.
10355 * Do a pre-pass to figure out the total amount of space we'll
10356 * need (sz), and see whether all argument are strings.
10357 */
10358 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010359#ifdef Py_DEBUG
10360 use_memcpy = 0;
10361#else
10362 use_memcpy = 1;
10363#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010364 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010365 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010366 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010367 if (!PyUnicode_Check(item)) {
10368 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010369 "sequence item %zd: expected str instance,"
10370 " %.80s found",
10371 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010372 goto onError;
10373 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 if (PyUnicode_READY(item) == -1)
10375 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010376 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010378 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010379 if (i != 0) {
10380 add_sz += seplen;
10381 }
10382 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010383 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010384 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010385 goto onError;
10386 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010387 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010388 if (use_memcpy && last_obj != NULL) {
10389 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10390 use_memcpy = 0;
10391 }
10392 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010393 }
Tim Petersced69f82003-09-16 20:30:58 +000010394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010395 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010396 if (res == NULL)
10397 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010398
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010399 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010400#ifdef Py_DEBUG
10401 use_memcpy = 0;
10402#else
10403 if (use_memcpy) {
10404 res_data = PyUnicode_1BYTE_DATA(res);
10405 kind = PyUnicode_KIND(res);
10406 if (seplen != 0)
10407 sep_data = PyUnicode_1BYTE_DATA(sep);
10408 }
10409#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010410 if (use_memcpy) {
10411 for (i = 0; i < seqlen; ++i) {
10412 Py_ssize_t itemlen;
10413 item = items[i];
10414
10415 /* Copy item, and maybe the separator. */
10416 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010417 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010418 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010419 kind * seplen);
10420 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010421 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010422
10423 itemlen = PyUnicode_GET_LENGTH(item);
10424 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010425 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010426 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010427 kind * itemlen);
10428 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010429 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010430 }
10431 assert(res_data == PyUnicode_1BYTE_DATA(res)
10432 + kind * PyUnicode_GET_LENGTH(res));
10433 }
10434 else {
10435 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10436 Py_ssize_t itemlen;
10437 item = items[i];
10438
10439 /* Copy item, and maybe the separator. */
10440 if (i && seplen != 0) {
10441 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10442 res_offset += seplen;
10443 }
10444
10445 itemlen = PyUnicode_GET_LENGTH(item);
10446 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010447 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010448 res_offset += itemlen;
10449 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010450 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010451 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010452 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010453
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010455 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010456 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010457
Benjamin Peterson29060642009-01-31 22:14:21 +000010458 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010460 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010461 return NULL;
10462}
10463
Victor Stinnerd3f08822012-05-29 12:57:52 +020010464void
10465_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10466 Py_UCS4 fill_char)
10467{
10468 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010469 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010470 assert(PyUnicode_IS_READY(unicode));
10471 assert(unicode_modifiable(unicode));
10472 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10473 assert(start >= 0);
10474 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010475 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010476}
10477
Victor Stinner3fe55312012-01-04 00:33:50 +010010478Py_ssize_t
10479PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10480 Py_UCS4 fill_char)
10481{
10482 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010483
10484 if (!PyUnicode_Check(unicode)) {
10485 PyErr_BadInternalCall();
10486 return -1;
10487 }
10488 if (PyUnicode_READY(unicode) == -1)
10489 return -1;
10490 if (unicode_check_modifiable(unicode))
10491 return -1;
10492
Victor Stinnerd3f08822012-05-29 12:57:52 +020010493 if (start < 0) {
10494 PyErr_SetString(PyExc_IndexError, "string index out of range");
10495 return -1;
10496 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010497 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10498 PyErr_SetString(PyExc_ValueError,
10499 "fill character is bigger than "
10500 "the string maximum character");
10501 return -1;
10502 }
10503
10504 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10505 length = Py_MIN(maxlen, length);
10506 if (length <= 0)
10507 return 0;
10508
Victor Stinnerd3f08822012-05-29 12:57:52 +020010509 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010510 return length;
10511}
10512
Victor Stinner9310abb2011-10-05 00:59:23 +020010513static PyObject *
10514pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010515 Py_ssize_t left,
10516 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010517 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010518{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010519 PyObject *u;
10520 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010521 int kind;
10522 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010523
10524 if (left < 0)
10525 left = 0;
10526 if (right < 0)
10527 right = 0;
10528
Victor Stinnerc4b49542011-12-11 22:44:26 +010010529 if (left == 0 && right == 0)
10530 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10533 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010534 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10535 return NULL;
10536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010537 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010538 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010540 if (!u)
10541 return NULL;
10542
10543 kind = PyUnicode_KIND(u);
10544 data = PyUnicode_DATA(u);
10545 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010546 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010547 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010548 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010549 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010550 assert(_PyUnicode_CheckConsistency(u, 1));
10551 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010552}
10553
Alexander Belopolsky40018472011-02-26 01:02:56 +000010554PyObject *
10555PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010556{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010558
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010559 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010560 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010561
Benjamin Petersonead6b532011-12-20 17:23:42 -060010562 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010563 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010564 if (PyUnicode_IS_ASCII(string))
10565 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010566 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010567 PyUnicode_GET_LENGTH(string), keepends);
10568 else
10569 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010570 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010571 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 break;
10573 case PyUnicode_2BYTE_KIND:
10574 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010575 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 PyUnicode_GET_LENGTH(string), keepends);
10577 break;
10578 case PyUnicode_4BYTE_KIND:
10579 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010580 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 PyUnicode_GET_LENGTH(string), keepends);
10582 break;
10583 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010584 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587}
10588
Alexander Belopolsky40018472011-02-26 01:02:56 +000010589static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010590split(PyObject *self,
10591 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010592 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010594 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010595 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 Py_ssize_t len1, len2;
10597 PyObject* out;
10598
Guido van Rossumd57fd912000-03-10 22:53:23 +000010599 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010600 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 if (PyUnicode_READY(self) == -1)
10603 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010606 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010608 if (PyUnicode_IS_ASCII(self))
10609 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010610 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010611 PyUnicode_GET_LENGTH(self), maxcount
10612 );
10613 else
10614 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010615 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010616 PyUnicode_GET_LENGTH(self), maxcount
10617 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 case PyUnicode_2BYTE_KIND:
10619 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010620 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 PyUnicode_GET_LENGTH(self), maxcount
10622 );
10623 case PyUnicode_4BYTE_KIND:
10624 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010625 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 PyUnicode_GET_LENGTH(self), maxcount
10627 );
10628 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010629 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010630 }
10631
10632 if (PyUnicode_READY(substring) == -1)
10633 return NULL;
10634
10635 kind1 = PyUnicode_KIND(self);
10636 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010637 len1 = PyUnicode_GET_LENGTH(self);
10638 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010639 if (kind1 < kind2 || len1 < len2) {
10640 out = PyList_New(1);
10641 if (out == NULL)
10642 return NULL;
10643 Py_INCREF(self);
10644 PyList_SET_ITEM(out, 0, self);
10645 return out;
10646 }
10647 buf1 = PyUnicode_DATA(self);
10648 buf2 = PyUnicode_DATA(substring);
10649 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010650 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010651 if (!buf2)
10652 return NULL;
10653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010655 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010657 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10658 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010659 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010660 else
10661 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010662 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 break;
10664 case PyUnicode_2BYTE_KIND:
10665 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010666 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 break;
10668 case PyUnicode_4BYTE_KIND:
10669 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010670 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 break;
10672 default:
10673 out = NULL;
10674 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010675 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010676 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010677 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679}
10680
Alexander Belopolsky40018472011-02-26 01:02:56 +000010681static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010682rsplit(PyObject *self,
10683 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010684 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010685{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010686 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010687 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 Py_ssize_t len1, len2;
10689 PyObject* out;
10690
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010691 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010692 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010693
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 if (PyUnicode_READY(self) == -1)
10695 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010698 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010700 if (PyUnicode_IS_ASCII(self))
10701 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010702 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010703 PyUnicode_GET_LENGTH(self), maxcount
10704 );
10705 else
10706 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010707 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010708 PyUnicode_GET_LENGTH(self), maxcount
10709 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710 case PyUnicode_2BYTE_KIND:
10711 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010712 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 PyUnicode_GET_LENGTH(self), maxcount
10714 );
10715 case PyUnicode_4BYTE_KIND:
10716 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010717 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010718 PyUnicode_GET_LENGTH(self), maxcount
10719 );
10720 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010721 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 }
10723
10724 if (PyUnicode_READY(substring) == -1)
10725 return NULL;
10726
10727 kind1 = PyUnicode_KIND(self);
10728 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 len1 = PyUnicode_GET_LENGTH(self);
10730 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010731 if (kind1 < kind2 || len1 < len2) {
10732 out = PyList_New(1);
10733 if (out == NULL)
10734 return NULL;
10735 Py_INCREF(self);
10736 PyList_SET_ITEM(out, 0, self);
10737 return out;
10738 }
10739 buf1 = PyUnicode_DATA(self);
10740 buf2 = PyUnicode_DATA(substring);
10741 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010742 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010743 if (!buf2)
10744 return NULL;
10745 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010746
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010747 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010748 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010749 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10750 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010751 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010752 else
10753 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010754 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 break;
10756 case PyUnicode_2BYTE_KIND:
10757 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010758 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 break;
10760 case PyUnicode_4BYTE_KIND:
10761 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010762 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 break;
10764 default:
10765 out = NULL;
10766 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010767 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010768 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010769 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010770 return out;
10771}
10772
10773static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010774anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10775 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010776{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010777 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010779 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10780 return asciilib_find(buf1, len1, buf2, len2, offset);
10781 else
10782 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010783 case PyUnicode_2BYTE_KIND:
10784 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10785 case PyUnicode_4BYTE_KIND:
10786 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10787 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010788 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010789}
10790
10791static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010792anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10793 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010794{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010795 switch (kind) {
10796 case PyUnicode_1BYTE_KIND:
10797 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10798 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10799 else
10800 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10801 case PyUnicode_2BYTE_KIND:
10802 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10803 case PyUnicode_4BYTE_KIND:
10804 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10805 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010806 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010807}
10808
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010809static void
10810replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10811 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10812{
10813 int kind = PyUnicode_KIND(u);
10814 void *data = PyUnicode_DATA(u);
10815 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10816 if (kind == PyUnicode_1BYTE_KIND) {
10817 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10818 (Py_UCS1 *)data + len,
10819 u1, u2, maxcount);
10820 }
10821 else if (kind == PyUnicode_2BYTE_KIND) {
10822 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10823 (Py_UCS2 *)data + len,
10824 u1, u2, maxcount);
10825 }
10826 else {
10827 assert(kind == PyUnicode_4BYTE_KIND);
10828 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10829 (Py_UCS4 *)data + len,
10830 u1, u2, maxcount);
10831 }
10832}
10833
Alexander Belopolsky40018472011-02-26 01:02:56 +000010834static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835replace(PyObject *self, PyObject *str1,
10836 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010838 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010839 const char *sbuf = PyUnicode_DATA(self);
10840 const void *buf1 = PyUnicode_DATA(str1);
10841 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010842 int srelease = 0, release1 = 0, release2 = 0;
10843 int skind = PyUnicode_KIND(self);
10844 int kind1 = PyUnicode_KIND(str1);
10845 int kind2 = PyUnicode_KIND(str2);
10846 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10847 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10848 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010849 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010850 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010852 if (slen < len1)
10853 goto nothing;
10854
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010856 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010857 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010858 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010859
Victor Stinner59de0ee2011-10-07 10:01:28 +020010860 if (str1 == str2)
10861 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010862
Victor Stinner49a0a212011-10-12 23:46:10 +020010863 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010864 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10865 if (maxchar < maxchar_str1)
10866 /* substring too wide to be present */
10867 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010868 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10869 /* Replacing str1 with str2 may cause a maxchar reduction in the
10870 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010871 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010872 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010873
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010875 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010876 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010877 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010878 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010879 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010880 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010881 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010882
Victor Stinner69ed0f42013-04-09 21:48:24 +020010883 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010884 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010885 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010886 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010887 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010888 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010889 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010890 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010891
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010892 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10893 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010894 }
10895 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010896 int rkind = skind;
10897 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010898 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010900 if (kind1 < rkind) {
10901 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010902 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010903 if (!buf1) goto error;
10904 release1 = 1;
10905 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010906 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010907 if (i < 0)
10908 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010909 if (rkind > kind2) {
10910 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010911 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010912 if (!buf2) goto error;
10913 release2 = 1;
10914 }
10915 else if (rkind < kind2) {
10916 /* widen self and buf1 */
10917 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010918 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010919 assert(buf1 != PyUnicode_DATA(str1));
10920 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010921 buf1 = PyUnicode_DATA(str1);
10922 release1 = 0;
10923 }
10924 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010925 if (!sbuf) goto error;
10926 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010927 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010928 if (!buf1) goto error;
10929 release1 = 1;
10930 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010931 u = PyUnicode_New(slen, maxchar);
10932 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010933 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010934 assert(PyUnicode_KIND(u) == rkind);
10935 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010936
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010937 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010938 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010939 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010940 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010941 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010943
10944 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010945 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010946 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010947 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010948 if (i == -1)
10949 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010950 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010951 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010952 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010953 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010954 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010955 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010956 }
10957 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010959 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010960 int rkind = skind;
10961 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010963 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010964 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010965 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 if (!buf1) goto error;
10967 release1 = 1;
10968 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010969 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010970 if (n == 0)
10971 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010972 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010973 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010974 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010975 if (!buf2) goto error;
10976 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010977 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010978 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010979 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010981 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010982 if (!sbuf) goto error;
10983 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010984 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010985 assert(buf1 != PyUnicode_DATA(str1));
10986 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010987 buf1 = PyUnicode_DATA(str1);
10988 release1 = 0;
10989 }
10990 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010991 if (!buf1) goto error;
10992 release1 = 1;
10993 }
10994 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10995 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010996 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010997 PyErr_SetString(PyExc_OverflowError,
10998 "replace string is too long");
10999 goto error;
11000 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010011001 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020011002 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020011003 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020011004 goto done;
11005 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080011006 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011007 PyErr_SetString(PyExc_OverflowError,
11008 "replace string is too long");
11009 goto error;
11010 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011011 u = PyUnicode_New(new_size, maxchar);
11012 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020011014 assert(PyUnicode_KIND(u) == rkind);
11015 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 ires = i = 0;
11017 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011018 while (n-- > 0) {
11019 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020011020 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011021 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020011022 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000011023 if (j == -1)
11024 break;
11025 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011026 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011027 memcpy(res + rkind * ires,
11028 sbuf + rkind * i,
11029 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011030 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011031 }
11032 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011034 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011035 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011036 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011037 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011038 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011039 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011040 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011041 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011042 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011043 memcpy(res + rkind * ires,
11044 sbuf + rkind * i,
11045 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020011046 }
11047 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011048 /* interleave */
11049 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011050 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011051 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011052 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011053 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011054 if (--n <= 0)
11055 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011056 memcpy(res + rkind * ires,
11057 sbuf + rkind * i,
11058 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011059 ires++;
11060 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011061 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011062 memcpy(res + rkind * ires,
11063 sbuf + rkind * i,
11064 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011065 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011066 }
11067
11068 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020011069 unicode_adjust_maxchar(&u);
11070 if (u == NULL)
11071 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011072 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011073
11074 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011075 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11076 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11077 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011078 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011079 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011080 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011081 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011082 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011083 PyMem_Free((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011084 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011086
Benjamin Peterson29060642009-01-31 22:14:21 +000011087 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000011088 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011089 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11090 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11091 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011092 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011093 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011094 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011095 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011096 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011097 PyMem_Free((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010011098 return unicode_result_unchanged(self);
11099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011100 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011101 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11102 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11103 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11104 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011105 PyMem_Free((void *)sbuf);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011106 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011107 PyMem_Free((void *)buf1);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011108 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011109 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011111}
11112
11113/* --- Unicode Object Methods --------------------------------------------- */
11114
INADA Naoki3ae20562017-01-16 20:41:20 +090011115/*[clinic input]
11116str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000011117
INADA Naoki3ae20562017-01-16 20:41:20 +090011118Return a version of the string where each word is titlecased.
11119
11120More specifically, words start with uppercased characters and all remaining
11121cased characters have lower case.
11122[clinic start generated code]*/
11123
11124static PyObject *
11125unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011126/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011127{
Benjamin Petersoneea48462012-01-16 14:28:50 -050011128 if (PyUnicode_READY(self) == -1)
11129 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011130 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011131}
11132
INADA Naoki3ae20562017-01-16 20:41:20 +090011133/*[clinic input]
11134str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000011135
INADA Naoki3ae20562017-01-16 20:41:20 +090011136Return a capitalized version of the string.
11137
11138More specifically, make the first character have upper case and the rest lower
11139case.
11140[clinic start generated code]*/
11141
11142static PyObject *
11143unicode_capitalize_impl(PyObject *self)
11144/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011145{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011146 if (PyUnicode_READY(self) == -1)
11147 return NULL;
11148 if (PyUnicode_GET_LENGTH(self) == 0)
11149 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011150 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011151}
11152
INADA Naoki3ae20562017-01-16 20:41:20 +090011153/*[clinic input]
11154str.casefold as unicode_casefold
11155
11156Return a version of the string suitable for caseless comparisons.
11157[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011158
11159static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011160unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011161/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011162{
11163 if (PyUnicode_READY(self) == -1)
11164 return NULL;
11165 if (PyUnicode_IS_ASCII(self))
11166 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011167 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011168}
11169
11170
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011171/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011172
11173static int
11174convert_uc(PyObject *obj, void *addr)
11175{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011176 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011177
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011178 if (!PyUnicode_Check(obj)) {
11179 PyErr_Format(PyExc_TypeError,
11180 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011181 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011182 return 0;
11183 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011184 if (PyUnicode_READY(obj) < 0)
11185 return 0;
11186 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011187 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011188 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011189 return 0;
11190 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011191 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011192 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011193}
11194
INADA Naoki3ae20562017-01-16 20:41:20 +090011195/*[clinic input]
11196str.center as unicode_center
11197
11198 width: Py_ssize_t
11199 fillchar: Py_UCS4 = ' '
11200 /
11201
11202Return a centered string of length width.
11203
11204Padding is done using the specified fill character (default is a space).
11205[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011206
11207static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011208unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11209/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011210{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011211 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011212
Benjamin Petersonbac79492012-01-14 13:34:47 -050011213 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011214 return NULL;
11215
Victor Stinnerc4b49542011-12-11 22:44:26 +010011216 if (PyUnicode_GET_LENGTH(self) >= width)
11217 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011218
Victor Stinnerc4b49542011-12-11 22:44:26 +010011219 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011220 left = marg / 2 + (marg & width & 1);
11221
Victor Stinner9310abb2011-10-05 00:59:23 +020011222 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011223}
11224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011225/* This function assumes that str1 and str2 are readied by the caller. */
11226
Marc-André Lemburge5034372000-08-08 08:04:29 +000011227static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011228unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011229{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011230#define COMPARE(TYPE1, TYPE2) \
11231 do { \
11232 TYPE1* p1 = (TYPE1 *)data1; \
11233 TYPE2* p2 = (TYPE2 *)data2; \
11234 TYPE1* end = p1 + len; \
11235 Py_UCS4 c1, c2; \
11236 for (; p1 != end; p1++, p2++) { \
11237 c1 = *p1; \
11238 c2 = *p2; \
11239 if (c1 != c2) \
11240 return (c1 < c2) ? -1 : 1; \
11241 } \
11242 } \
11243 while (0)
11244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011245 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011246 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011247 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011249 kind1 = PyUnicode_KIND(str1);
11250 kind2 = PyUnicode_KIND(str2);
11251 data1 = PyUnicode_DATA(str1);
11252 data2 = PyUnicode_DATA(str2);
11253 len1 = PyUnicode_GET_LENGTH(str1);
11254 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011255 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011256
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011257 switch(kind1) {
11258 case PyUnicode_1BYTE_KIND:
11259 {
11260 switch(kind2) {
11261 case PyUnicode_1BYTE_KIND:
11262 {
11263 int cmp = memcmp(data1, data2, len);
11264 /* normalize result of memcmp() into the range [-1; 1] */
11265 if (cmp < 0)
11266 return -1;
11267 if (cmp > 0)
11268 return 1;
11269 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011270 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011271 case PyUnicode_2BYTE_KIND:
11272 COMPARE(Py_UCS1, Py_UCS2);
11273 break;
11274 case PyUnicode_4BYTE_KIND:
11275 COMPARE(Py_UCS1, Py_UCS4);
11276 break;
11277 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011278 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011279 }
11280 break;
11281 }
11282 case PyUnicode_2BYTE_KIND:
11283 {
11284 switch(kind2) {
11285 case PyUnicode_1BYTE_KIND:
11286 COMPARE(Py_UCS2, Py_UCS1);
11287 break;
11288 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011289 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011290 COMPARE(Py_UCS2, Py_UCS2);
11291 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011292 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011293 case PyUnicode_4BYTE_KIND:
11294 COMPARE(Py_UCS2, Py_UCS4);
11295 break;
11296 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011297 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011298 }
11299 break;
11300 }
11301 case PyUnicode_4BYTE_KIND:
11302 {
11303 switch(kind2) {
11304 case PyUnicode_1BYTE_KIND:
11305 COMPARE(Py_UCS4, Py_UCS1);
11306 break;
11307 case PyUnicode_2BYTE_KIND:
11308 COMPARE(Py_UCS4, Py_UCS2);
11309 break;
11310 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011311 {
11312#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11313 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11314 /* normalize result of wmemcmp() into the range [-1; 1] */
11315 if (cmp < 0)
11316 return -1;
11317 if (cmp > 0)
11318 return 1;
11319#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011320 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011321#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011322 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011323 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011324 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011325 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011326 }
11327 break;
11328 }
11329 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011330 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011331 }
11332
Victor Stinner770e19e2012-10-04 22:59:45 +020011333 if (len1 == len2)
11334 return 0;
11335 if (len1 < len2)
11336 return -1;
11337 else
11338 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011339
11340#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011341}
11342
Benjamin Peterson621b4302016-09-09 13:54:34 -070011343static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011344unicode_compare_eq(PyObject *str1, PyObject *str2)
11345{
11346 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011347 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011348 Py_ssize_t len;
11349 int cmp;
11350
Victor Stinnere5567ad2012-10-23 02:48:49 +020011351 len = PyUnicode_GET_LENGTH(str1);
11352 if (PyUnicode_GET_LENGTH(str2) != len)
11353 return 0;
11354 kind = PyUnicode_KIND(str1);
11355 if (PyUnicode_KIND(str2) != kind)
11356 return 0;
11357 data1 = PyUnicode_DATA(str1);
11358 data2 = PyUnicode_DATA(str2);
11359
11360 cmp = memcmp(data1, data2, len * kind);
11361 return (cmp == 0);
11362}
11363
11364
Alexander Belopolsky40018472011-02-26 01:02:56 +000011365int
11366PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011367{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011368 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11369 if (PyUnicode_READY(left) == -1 ||
11370 PyUnicode_READY(right) == -1)
11371 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011372
11373 /* a string is equal to itself */
11374 if (left == right)
11375 return 0;
11376
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011377 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011378 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011379 PyErr_Format(PyExc_TypeError,
11380 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011381 Py_TYPE(left)->tp_name,
11382 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011383 return -1;
11384}
11385
Martin v. Löwis5b222132007-06-10 09:51:05 +000011386int
11387PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11388{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011389 Py_ssize_t i;
11390 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011391 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011392 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393
Victor Stinner910337b2011-10-03 03:20:16 +020011394 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011395 if (!PyUnicode_IS_READY(uni)) {
11396 const wchar_t *ws = _PyUnicode_WSTR(uni);
11397 /* Compare Unicode string and source character set string */
11398 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11399 if (chr != ustr[i])
11400 return (chr < ustr[i]) ? -1 : 1;
11401 }
11402 /* This check keeps Python strings that end in '\0' from comparing equal
11403 to C strings identical up to that point. */
11404 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11405 return 1; /* uni is longer */
11406 if (ustr[i])
11407 return -1; /* str is longer */
11408 return 0;
11409 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011410 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011411 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011412 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011413 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011414 size_t len, len2 = strlen(str);
11415 int cmp;
11416
11417 len = Py_MIN(len1, len2);
11418 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011419 if (cmp != 0) {
11420 if (cmp < 0)
11421 return -1;
11422 else
11423 return 1;
11424 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011425 if (len1 > len2)
11426 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011427 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011428 return -1; /* str is longer */
11429 return 0;
11430 }
11431 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011432 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011433 /* Compare Unicode string and source character set string */
11434 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011435 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011436 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11437 /* This check keeps Python strings that end in '\0' from comparing equal
11438 to C strings identical up to that point. */
11439 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11440 return 1; /* uni is longer */
11441 if (str[i])
11442 return -1; /* str is longer */
11443 return 0;
11444 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011445}
11446
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011447static int
11448non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11449{
11450 size_t i, len;
11451 const wchar_t *p;
11452 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11453 if (strlen(str) != len)
11454 return 0;
11455 p = _PyUnicode_WSTR(unicode);
11456 assert(p);
11457 for (i = 0; i < len; i++) {
11458 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011459 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011460 return 0;
11461 }
11462 return 1;
11463}
11464
11465int
11466_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11467{
11468 size_t len;
11469 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011470 assert(str);
11471#ifndef NDEBUG
11472 for (const char *p = str; *p; p++) {
11473 assert((unsigned char)*p < 128);
11474 }
11475#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011476 if (PyUnicode_READY(unicode) == -1) {
11477 /* Memory error or bad data */
11478 PyErr_Clear();
11479 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11480 }
11481 if (!PyUnicode_IS_ASCII(unicode))
11482 return 0;
11483 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11484 return strlen(str) == len &&
11485 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11486}
11487
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011488int
11489_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11490{
11491 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011492
11493 assert(_PyUnicode_CHECK(left));
11494 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011495#ifndef NDEBUG
11496 for (const char *p = right->string; *p; p++) {
11497 assert((unsigned char)*p < 128);
11498 }
11499#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011500
11501 if (PyUnicode_READY(left) == -1) {
11502 /* memory error or bad data */
11503 PyErr_Clear();
11504 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11505 }
11506
11507 if (!PyUnicode_IS_ASCII(left))
11508 return 0;
11509
11510 right_uni = _PyUnicode_FromId(right); /* borrowed */
11511 if (right_uni == NULL) {
11512 /* memory error or bad data */
11513 PyErr_Clear();
11514 return _PyUnicode_EqualToASCIIString(left, right->string);
11515 }
11516
11517 if (left == right_uni)
11518 return 1;
11519
11520 if (PyUnicode_CHECK_INTERNED(left))
11521 return 0;
11522
INADA Naoki7cc95f52018-01-28 02:07:09 +090011523 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011524 Py_hash_t hash = _PyUnicode_HASH(left);
Victor Stinnerea251802020-12-26 02:58:33 +010011525 if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011526 return 0;
Victor Stinnerea251802020-12-26 02:58:33 +010011527 }
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011528
11529 return unicode_compare_eq(left, right_uni);
11530}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011531
Alexander Belopolsky40018472011-02-26 01:02:56 +000011532PyObject *
11533PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011534{
11535 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011536
Victor Stinnere5567ad2012-10-23 02:48:49 +020011537 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11538 Py_RETURN_NOTIMPLEMENTED;
11539
11540 if (PyUnicode_READY(left) == -1 ||
11541 PyUnicode_READY(right) == -1)
11542 return NULL;
11543
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011544 if (left == right) {
11545 switch (op) {
11546 case Py_EQ:
11547 case Py_LE:
11548 case Py_GE:
11549 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011550 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011551 case Py_NE:
11552 case Py_LT:
11553 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011554 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011555 default:
11556 PyErr_BadArgument();
11557 return NULL;
11558 }
11559 }
11560 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011561 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011562 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011563 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011564 }
11565 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011566 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011567 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011568 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011569}
11570
Alexander Belopolsky40018472011-02-26 01:02:56 +000011571int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011572_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11573{
11574 return unicode_eq(aa, bb);
11575}
11576
11577int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011578PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011579{
Victor Stinner77282cb2013-04-14 19:22:47 +020011580 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011581 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011582 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011583 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011584
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011585 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011587 "'in <string>' requires string as left operand, not %.100s",
11588 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011589 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011590 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011591 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011592 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011593 if (ensure_unicode(str) < 0)
11594 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011596 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011597 kind2 = PyUnicode_KIND(substr);
11598 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011599 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011600 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011601 len2 = PyUnicode_GET_LENGTH(substr);
11602 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011603 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011604 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011605 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011606 if (len2 == 1) {
11607 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11608 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011609 return result;
11610 }
11611 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011612 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011613 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011614 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616
Victor Stinner77282cb2013-04-14 19:22:47 +020011617 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011618 case PyUnicode_1BYTE_KIND:
11619 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11620 break;
11621 case PyUnicode_2BYTE_KIND:
11622 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11623 break;
11624 case PyUnicode_4BYTE_KIND:
11625 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11626 break;
11627 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011628 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011629 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011630
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011631 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011632 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011633 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634
Guido van Rossum403d68b2000-03-13 15:55:09 +000011635 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011636}
11637
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638/* Concat to string or Unicode object giving a new Unicode object. */
11639
Alexander Belopolsky40018472011-02-26 01:02:56 +000011640PyObject *
11641PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011643 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011644 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011645 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011647 if (ensure_unicode(left) < 0)
11648 return NULL;
11649
11650 if (!PyUnicode_Check(right)) {
11651 PyErr_Format(PyExc_TypeError,
11652 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011653 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011654 return NULL;
11655 }
11656 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011657 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658
11659 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011660 PyObject *empty = unicode_get_empty(); // Borrowed reference
11661 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011662 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011663 }
11664 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011665 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011666 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011667
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011668 left_len = PyUnicode_GET_LENGTH(left);
11669 right_len = PyUnicode_GET_LENGTH(right);
11670 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011671 PyErr_SetString(PyExc_OverflowError,
11672 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011673 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011674 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011675 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011676
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011677 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11678 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011679 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011682 result = PyUnicode_New(new_len, maxchar);
11683 if (result == NULL)
11684 return NULL;
11685 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11686 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11687 assert(_PyUnicode_CheckConsistency(result, 1));
11688 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689}
11690
Walter Dörwald1ab83302007-05-18 17:15:44 +000011691void
Victor Stinner23e56682011-10-03 03:54:37 +020011692PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011693{
Victor Stinner23e56682011-10-03 03:54:37 +020011694 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011695 Py_UCS4 maxchar, maxchar2;
11696 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011697
11698 if (p_left == NULL) {
11699 if (!PyErr_Occurred())
11700 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011701 return;
11702 }
Victor Stinner23e56682011-10-03 03:54:37 +020011703 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011704 if (right == NULL || left == NULL
11705 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011706 if (!PyErr_Occurred())
11707 PyErr_BadInternalCall();
11708 goto error;
11709 }
11710
Benjamin Petersonbac79492012-01-14 13:34:47 -050011711 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011712 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011713 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011714 goto error;
11715
Victor Stinner488fa492011-12-12 00:01:39 +010011716 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011717 PyObject *empty = unicode_get_empty(); // Borrowed reference
11718 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011719 Py_DECREF(left);
11720 Py_INCREF(right);
11721 *p_left = right;
11722 return;
11723 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011724 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011725 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011726 }
Victor Stinner488fa492011-12-12 00:01:39 +010011727
11728 left_len = PyUnicode_GET_LENGTH(left);
11729 right_len = PyUnicode_GET_LENGTH(right);
11730 if (left_len > PY_SSIZE_T_MAX - right_len) {
11731 PyErr_SetString(PyExc_OverflowError,
11732 "strings are too large to concat");
11733 goto error;
11734 }
11735 new_len = left_len + right_len;
11736
11737 if (unicode_modifiable(left)
11738 && PyUnicode_CheckExact(right)
11739 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011740 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11741 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011742 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011743 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011744 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11745 {
11746 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011747 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011748 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011749
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011750 /* copy 'right' into the newly allocated area of 'left' */
11751 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011752 }
Victor Stinner488fa492011-12-12 00:01:39 +010011753 else {
11754 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11755 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011756 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011757
Victor Stinner488fa492011-12-12 00:01:39 +010011758 /* Concat the two Unicode strings */
11759 res = PyUnicode_New(new_len, maxchar);
11760 if (res == NULL)
11761 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011762 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11763 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011764 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011765 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011766 }
11767 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011768 return;
11769
11770error:
Victor Stinner488fa492011-12-12 00:01:39 +010011771 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011772}
11773
11774void
11775PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11776{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011777 PyUnicode_Append(pleft, right);
11778 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011779}
11780
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011781/*
11782Wraps stringlib_parse_args_finds() and additionally ensures that the
11783first argument is a unicode object.
11784*/
11785
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011786static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011787parse_args_finds_unicode(const char * function_name, PyObject *args,
11788 PyObject **substring,
11789 Py_ssize_t *start, Py_ssize_t *end)
11790{
11791 if(stringlib_parse_args_finds(function_name, args, substring,
11792 start, end)) {
11793 if (ensure_unicode(*substring) < 0)
11794 return 0;
11795 return 1;
11796 }
11797 return 0;
11798}
11799
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011800PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011801 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011803Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011804string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011805interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806
11807static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011808unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011810 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011811 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011812 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011814 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011815 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011818 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011819 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011821 kind1 = PyUnicode_KIND(self);
11822 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011823 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011824 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011825
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011826 len1 = PyUnicode_GET_LENGTH(self);
11827 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011828 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011829 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011830 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011831
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011832 buf1 = PyUnicode_DATA(self);
11833 buf2 = PyUnicode_DATA(substring);
11834 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011835 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011836 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011837 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011838 }
11839 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 case PyUnicode_1BYTE_KIND:
11841 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011842 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 buf2, len2, PY_SSIZE_T_MAX
11844 );
11845 break;
11846 case PyUnicode_2BYTE_KIND:
11847 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011848 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849 buf2, len2, PY_SSIZE_T_MAX
11850 );
11851 break;
11852 case PyUnicode_4BYTE_KIND:
11853 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011854 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011855 buf2, len2, PY_SSIZE_T_MAX
11856 );
11857 break;
11858 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011859 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 }
11861
11862 result = PyLong_FromSsize_t(iresult);
11863
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011864 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011865 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011866 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868 return result;
11869}
11870
INADA Naoki3ae20562017-01-16 20:41:20 +090011871/*[clinic input]
11872str.encode as unicode_encode
11873
11874 encoding: str(c_default="NULL") = 'utf-8'
11875 The encoding in which to encode the string.
11876 errors: str(c_default="NULL") = 'strict'
11877 The error handling scheme to use for encoding errors.
11878 The default is 'strict' meaning that encoding errors raise a
11879 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11880 'xmlcharrefreplace' as well as any other name registered with
11881 codecs.register_error that can handle UnicodeEncodeErrors.
11882
11883Encode the string using the codec registered for encoding.
11884[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885
11886static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011887unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011888/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011890 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011891}
11892
INADA Naoki3ae20562017-01-16 20:41:20 +090011893/*[clinic input]
11894str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895
INADA Naoki3ae20562017-01-16 20:41:20 +090011896 tabsize: int = 8
11897
11898Return a copy where all tab characters are expanded using spaces.
11899
11900If tabsize is not given, a tab size of 8 characters is assumed.
11901[clinic start generated code]*/
11902
11903static PyObject *
11904unicode_expandtabs_impl(PyObject *self, int tabsize)
11905/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011907 Py_ssize_t i, j, line_pos, src_len, incr;
11908 Py_UCS4 ch;
11909 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011910 const void *src_data;
11911 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011912 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011913 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914
Antoine Pitrou22425222011-10-04 19:10:51 +020011915 if (PyUnicode_READY(self) == -1)
11916 return NULL;
11917
Thomas Wouters7e474022000-07-16 12:04:32 +000011918 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011919 src_len = PyUnicode_GET_LENGTH(self);
11920 i = j = line_pos = 0;
11921 kind = PyUnicode_KIND(self);
11922 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011923 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011924 for (; i < src_len; i++) {
11925 ch = PyUnicode_READ(kind, src_data, i);
11926 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011927 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011928 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011929 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011930 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011931 goto overflow;
11932 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011933 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011934 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011935 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011937 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011938 goto overflow;
11939 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011941 if (ch == '\n' || ch == '\r')
11942 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011943 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011944 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011945 if (!found)
11946 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011947
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011949 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011950 if (!u)
11951 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011952 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011953
Antoine Pitroue71d5742011-10-04 15:55:09 +020011954 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955
Antoine Pitroue71d5742011-10-04 15:55:09 +020011956 for (; i < src_len; i++) {
11957 ch = PyUnicode_READ(kind, src_data, i);
11958 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011959 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011960 incr = tabsize - (line_pos % tabsize);
11961 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011962 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011963 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011964 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011965 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011966 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011967 line_pos++;
11968 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011969 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011970 if (ch == '\n' || ch == '\r')
11971 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011972 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011973 }
11974 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011975 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011976
Antoine Pitroue71d5742011-10-04 15:55:09 +020011977 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011978 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11979 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980}
11981
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011982PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011983 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984\n\
11985Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011986such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011987arguments start and end are interpreted as in slice notation.\n\
11988\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011989Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990
11991static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011993{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011994 /* initialize variables to prevent gcc warning */
11995 PyObject *substring = NULL;
11996 Py_ssize_t start = 0;
11997 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011998 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012000 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012001 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012003 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012006 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012007
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012008 if (result == -2)
12009 return NULL;
12010
Christian Heimes217cfd12007-12-02 14:31:20 +000012011 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012}
12013
12014static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012015unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012016{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012017 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012018 enum PyUnicode_Kind kind;
12019 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012020
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012021 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012022 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012023 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012024 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012025 if (PyUnicode_READY(self) == -1) {
12026 return NULL;
12027 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012028 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
12029 PyErr_SetString(PyExc_IndexError, "string index out of range");
12030 return NULL;
12031 }
12032 kind = PyUnicode_KIND(self);
12033 data = PyUnicode_DATA(self);
12034 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010012035 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036}
12037
Guido van Rossumc2504932007-09-18 19:42:40 +000012038/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010012039 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000012040static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012041unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012042{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080012043 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000012044
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012045#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050012046 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012047#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012048 if (_PyUnicode_HASH(self) != -1)
12049 return _PyUnicode_HASH(self);
12050 if (PyUnicode_READY(self) == -1)
12051 return -1;
animalizea1d14252019-01-02 20:16:06 +080012052
Christian Heimes985ecdc2013-11-20 11:46:18 +010012053 x = _Py_HashBytes(PyUnicode_DATA(self),
12054 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012055 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000012056 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057}
12058
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012059PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012060 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061\n\
oldkaa0735f2018-02-02 16:52:55 +080012062Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012063such that sub is contained within S[start:end]. Optional\n\
12064arguments start and end are interpreted as in slice notation.\n\
12065\n\
12066Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067
12068static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012071 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000012072 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012073 PyObject *substring = NULL;
12074 Py_ssize_t start = 0;
12075 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012077 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012078 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012080 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012083 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 if (result == -2)
12086 return NULL;
12087
Guido van Rossumd57fd912000-03-10 22:53:23 +000012088 if (result < 0) {
12089 PyErr_SetString(PyExc_ValueError, "substring not found");
12090 return NULL;
12091 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012092
Christian Heimes217cfd12007-12-02 14:31:20 +000012093 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012094}
12095
INADA Naoki3ae20562017-01-16 20:41:20 +090012096/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090012097str.isascii as unicode_isascii
12098
12099Return True if all characters in the string are ASCII, False otherwise.
12100
12101ASCII characters have code points in the range U+0000-U+007F.
12102Empty string is ASCII too.
12103[clinic start generated code]*/
12104
12105static PyObject *
12106unicode_isascii_impl(PyObject *self)
12107/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12108{
12109 if (PyUnicode_READY(self) == -1) {
12110 return NULL;
12111 }
12112 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12113}
12114
12115/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090012116str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012117
INADA Naoki3ae20562017-01-16 20:41:20 +090012118Return True if the string is a lowercase string, False otherwise.
12119
12120A string is lowercase if all cased characters in the string are lowercase and
12121there is at least one cased character in the string.
12122[clinic start generated code]*/
12123
12124static PyObject *
12125unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012126/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012127{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 Py_ssize_t i, length;
12129 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012130 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012131 int cased;
12132
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 if (PyUnicode_READY(self) == -1)
12134 return NULL;
12135 length = PyUnicode_GET_LENGTH(self);
12136 kind = PyUnicode_KIND(self);
12137 data = PyUnicode_DATA(self);
12138
Guido van Rossumd57fd912000-03-10 22:53:23 +000012139 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 if (length == 1)
12141 return PyBool_FromLong(
12142 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012143
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012144 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012146 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012147
Guido van Rossumd57fd912000-03-10 22:53:23 +000012148 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 for (i = 0; i < length; i++) {
12150 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012151
Benjamin Peterson29060642009-01-31 22:14:21 +000012152 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012153 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012154 else if (!cased && Py_UNICODE_ISLOWER(ch))
12155 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012157 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158}
12159
INADA Naoki3ae20562017-01-16 20:41:20 +090012160/*[clinic input]
12161str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012162
INADA Naoki3ae20562017-01-16 20:41:20 +090012163Return True if the string is an uppercase string, False otherwise.
12164
12165A string is uppercase if all cased characters in the string are uppercase and
12166there is at least one cased character in the string.
12167[clinic start generated code]*/
12168
12169static PyObject *
12170unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012171/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012172{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012173 Py_ssize_t i, length;
12174 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012175 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012176 int cased;
12177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012178 if (PyUnicode_READY(self) == -1)
12179 return NULL;
12180 length = PyUnicode_GET_LENGTH(self);
12181 kind = PyUnicode_KIND(self);
12182 data = PyUnicode_DATA(self);
12183
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 if (length == 1)
12186 return PyBool_FromLong(
12187 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012188
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012189 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012191 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012192
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012194 for (i = 0; i < length; i++) {
12195 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012196
Benjamin Peterson29060642009-01-31 22:14:21 +000012197 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012198 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012199 else if (!cased && Py_UNICODE_ISUPPER(ch))
12200 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012201 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012202 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203}
12204
INADA Naoki3ae20562017-01-16 20:41:20 +090012205/*[clinic input]
12206str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207
INADA Naoki3ae20562017-01-16 20:41:20 +090012208Return True if the string is a title-cased string, False otherwise.
12209
12210In a title-cased string, upper- and title-case characters may only
12211follow uncased characters and lowercase characters only cased ones.
12212[clinic start generated code]*/
12213
12214static PyObject *
12215unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012216/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 Py_ssize_t i, length;
12219 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012220 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012221 int cased, previous_is_cased;
12222
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012223 if (PyUnicode_READY(self) == -1)
12224 return NULL;
12225 length = PyUnicode_GET_LENGTH(self);
12226 kind = PyUnicode_KIND(self);
12227 data = PyUnicode_DATA(self);
12228
Guido van Rossumd57fd912000-03-10 22:53:23 +000012229 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012230 if (length == 1) {
12231 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12232 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12233 (Py_UNICODE_ISUPPER(ch) != 0));
12234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012235
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012236 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012238 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012239
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240 cased = 0;
12241 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012242 for (i = 0; i < length; i++) {
12243 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012244
Benjamin Peterson29060642009-01-31 22:14:21 +000012245 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12246 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012247 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012248 previous_is_cased = 1;
12249 cased = 1;
12250 }
12251 else if (Py_UNICODE_ISLOWER(ch)) {
12252 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012253 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012254 previous_is_cased = 1;
12255 cased = 1;
12256 }
12257 else
12258 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012260 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261}
12262
INADA Naoki3ae20562017-01-16 20:41:20 +090012263/*[clinic input]
12264str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265
INADA Naoki3ae20562017-01-16 20:41:20 +090012266Return True if the string is a whitespace string, False otherwise.
12267
12268A string is whitespace if all characters in the string are whitespace and there
12269is at least one character in the string.
12270[clinic start generated code]*/
12271
12272static PyObject *
12273unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012274/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012275{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012276 Py_ssize_t i, length;
12277 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012278 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012279
12280 if (PyUnicode_READY(self) == -1)
12281 return NULL;
12282 length = PyUnicode_GET_LENGTH(self);
12283 kind = PyUnicode_KIND(self);
12284 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012285
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012287 if (length == 1)
12288 return PyBool_FromLong(
12289 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012291 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012293 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295 for (i = 0; i < length; i++) {
12296 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012297 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012298 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012299 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012300 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301}
12302
INADA Naoki3ae20562017-01-16 20:41:20 +090012303/*[clinic input]
12304str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012305
INADA Naoki3ae20562017-01-16 20:41:20 +090012306Return True if the string is an alphabetic string, False otherwise.
12307
12308A string is alphabetic if all characters in the string are alphabetic and there
12309is at least one character in the string.
12310[clinic start generated code]*/
12311
12312static PyObject *
12313unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012314/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012315{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012316 Py_ssize_t i, length;
12317 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012318 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012319
12320 if (PyUnicode_READY(self) == -1)
12321 return NULL;
12322 length = PyUnicode_GET_LENGTH(self);
12323 kind = PyUnicode_KIND(self);
12324 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012325
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012326 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012327 if (length == 1)
12328 return PyBool_FromLong(
12329 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012330
12331 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012332 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012333 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012335 for (i = 0; i < length; i++) {
12336 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012337 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012338 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012339 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012340}
12341
INADA Naoki3ae20562017-01-16 20:41:20 +090012342/*[clinic input]
12343str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012344
INADA Naoki3ae20562017-01-16 20:41:20 +090012345Return True if the string is an alpha-numeric string, False otherwise.
12346
12347A string is alpha-numeric if all characters in the string are alpha-numeric and
12348there is at least one character in the string.
12349[clinic start generated code]*/
12350
12351static PyObject *
12352unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012353/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012354{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012355 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012356 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357 Py_ssize_t len, i;
12358
12359 if (PyUnicode_READY(self) == -1)
12360 return NULL;
12361
12362 kind = PyUnicode_KIND(self);
12363 data = PyUnicode_DATA(self);
12364 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012365
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012366 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 if (len == 1) {
12368 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12369 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12370 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012371
12372 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012374 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012376 for (i = 0; i < len; i++) {
12377 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012378 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012379 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012380 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012381 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012382}
12383
INADA Naoki3ae20562017-01-16 20:41:20 +090012384/*[clinic input]
12385str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012386
INADA Naoki3ae20562017-01-16 20:41:20 +090012387Return True if the string is a decimal string, False otherwise.
12388
12389A string is a decimal string if all characters in the string are decimal and
12390there is at least one character in the string.
12391[clinic start generated code]*/
12392
12393static PyObject *
12394unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012395/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012396{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 Py_ssize_t i, length;
12398 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012399 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012400
12401 if (PyUnicode_READY(self) == -1)
12402 return NULL;
12403 length = PyUnicode_GET_LENGTH(self);
12404 kind = PyUnicode_KIND(self);
12405 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012406
Guido van Rossumd57fd912000-03-10 22:53:23 +000012407 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012408 if (length == 1)
12409 return PyBool_FromLong(
12410 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012411
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012412 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012413 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012414 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012416 for (i = 0; i < length; i++) {
12417 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012418 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012419 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012420 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012421}
12422
INADA Naoki3ae20562017-01-16 20:41:20 +090012423/*[clinic input]
12424str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012425
INADA Naoki3ae20562017-01-16 20:41:20 +090012426Return True if the string is a digit string, False otherwise.
12427
12428A string is a digit string if all characters in the string are digits and there
12429is at least one character in the string.
12430[clinic start generated code]*/
12431
12432static PyObject *
12433unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012434/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012435{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012436 Py_ssize_t i, length;
12437 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012438 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012439
12440 if (PyUnicode_READY(self) == -1)
12441 return NULL;
12442 length = PyUnicode_GET_LENGTH(self);
12443 kind = PyUnicode_KIND(self);
12444 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012445
Guido van Rossumd57fd912000-03-10 22:53:23 +000012446 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 if (length == 1) {
12448 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12449 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12450 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012451
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012452 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012454 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 for (i = 0; i < length; i++) {
12457 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012458 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012459 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012460 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012461}
12462
INADA Naoki3ae20562017-01-16 20:41:20 +090012463/*[clinic input]
12464str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012465
INADA Naoki3ae20562017-01-16 20:41:20 +090012466Return True if the string is a numeric string, False otherwise.
12467
12468A string is numeric if all characters in the string are numeric and there is at
12469least one character in the string.
12470[clinic start generated code]*/
12471
12472static PyObject *
12473unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012474/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012475{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 Py_ssize_t i, length;
12477 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012478 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012479
12480 if (PyUnicode_READY(self) == -1)
12481 return NULL;
12482 length = PyUnicode_GET_LENGTH(self);
12483 kind = PyUnicode_KIND(self);
12484 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012485
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012487 if (length == 1)
12488 return PyBool_FromLong(
12489 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012490
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012491 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012492 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012493 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012494
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012495 for (i = 0; i < length; i++) {
12496 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012497 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012498 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012499 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012500}
12501
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012502Py_ssize_t
12503_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012504{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012505 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012506 if (PyUnicode_READY(self) == -1)
12507 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012508
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012509 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012510 if (len == 0) {
12511 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012512 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012513 }
12514
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012515 int kind = PyUnicode_KIND(self);
12516 const void *data = PyUnicode_DATA(self);
12517 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012518 /* PEP 3131 says that the first character must be in
12519 XID_Start and subsequent characters in XID_Continue,
12520 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012521 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012522 letters, digits, underscore). However, given the current
12523 definition of XID_Start and XID_Continue, it is sufficient
12524 to check just for these, except that _ must be allowed
12525 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012526 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012527 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012528 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012529
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012530 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012531 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012532 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012533 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012534 }
12535 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012536 return i;
12537}
12538
12539int
12540PyUnicode_IsIdentifier(PyObject *self)
12541{
12542 if (PyUnicode_IS_READY(self)) {
12543 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12544 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12545 /* an empty string is not a valid identifier */
12546 return len && i == len;
12547 }
12548 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012549_Py_COMP_DIAG_PUSH
12550_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012551 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012552 if (len == 0) {
12553 /* an empty string is not a valid identifier */
12554 return 0;
12555 }
12556
12557 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012558 Py_UCS4 ch = wstr[i++];
12559#if SIZEOF_WCHAR_T == 2
12560 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12561 && i < len
12562 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12563 {
12564 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12565 i++;
12566 }
12567#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012568 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12569 return 0;
12570 }
12571
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012572 while (i < len) {
12573 ch = wstr[i++];
12574#if SIZEOF_WCHAR_T == 2
12575 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12576 && i < len
12577 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12578 {
12579 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12580 i++;
12581 }
12582#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012583 if (!_PyUnicode_IsXidContinue(ch)) {
12584 return 0;
12585 }
12586 }
12587 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012588_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012589 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012590}
12591
INADA Naoki3ae20562017-01-16 20:41:20 +090012592/*[clinic input]
12593str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012594
INADA Naoki3ae20562017-01-16 20:41:20 +090012595Return True if the string is a valid Python identifier, False otherwise.
12596
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012597Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012598such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012599[clinic start generated code]*/
12600
12601static PyObject *
12602unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012603/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012604{
12605 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12606}
12607
INADA Naoki3ae20562017-01-16 20:41:20 +090012608/*[clinic input]
12609str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012610
INADA Naoki3ae20562017-01-16 20:41:20 +090012611Return True if the string is printable, False otherwise.
12612
12613A string is printable if all of its characters are considered printable in
12614repr() or if it is empty.
12615[clinic start generated code]*/
12616
12617static PyObject *
12618unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012619/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012620{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012621 Py_ssize_t i, length;
12622 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012623 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012624
12625 if (PyUnicode_READY(self) == -1)
12626 return NULL;
12627 length = PyUnicode_GET_LENGTH(self);
12628 kind = PyUnicode_KIND(self);
12629 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012630
12631 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 if (length == 1)
12633 return PyBool_FromLong(
12634 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636 for (i = 0; i < length; i++) {
12637 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012638 Py_RETURN_FALSE;
12639 }
12640 }
12641 Py_RETURN_TRUE;
12642}
12643
INADA Naoki3ae20562017-01-16 20:41:20 +090012644/*[clinic input]
12645str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012646
INADA Naoki3ae20562017-01-16 20:41:20 +090012647 iterable: object
12648 /
12649
12650Concatenate any number of strings.
12651
Martin Panter91a88662017-01-24 00:30:06 +000012652The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012653The result is returned as a new string.
12654
12655Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12656[clinic start generated code]*/
12657
12658static PyObject *
12659unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012660/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012661{
INADA Naoki3ae20562017-01-16 20:41:20 +090012662 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663}
12664
Martin v. Löwis18e16552006-02-15 17:27:45 +000012665static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012666unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012668 if (PyUnicode_READY(self) == -1)
12669 return -1;
12670 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671}
12672
INADA Naoki3ae20562017-01-16 20:41:20 +090012673/*[clinic input]
12674str.ljust as unicode_ljust
12675
12676 width: Py_ssize_t
12677 fillchar: Py_UCS4 = ' '
12678 /
12679
12680Return a left-justified string of length width.
12681
12682Padding is done using the specified fill character (default is a space).
12683[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012684
12685static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012686unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12687/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012688{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012689 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012690 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012691
Victor Stinnerc4b49542011-12-11 22:44:26 +010012692 if (PyUnicode_GET_LENGTH(self) >= width)
12693 return unicode_result_unchanged(self);
12694
12695 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012696}
12697
INADA Naoki3ae20562017-01-16 20:41:20 +090012698/*[clinic input]
12699str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012700
INADA Naoki3ae20562017-01-16 20:41:20 +090012701Return a copy of the string converted to lowercase.
12702[clinic start generated code]*/
12703
12704static PyObject *
12705unicode_lower_impl(PyObject *self)
12706/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012707{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012708 if (PyUnicode_READY(self) == -1)
12709 return NULL;
12710 if (PyUnicode_IS_ASCII(self))
12711 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012712 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713}
12714
/* Strip modes understood by the strip helpers.  The values double as
   indexes into stripfuncnames[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each strip mode, indexed by the constants above. */
static const char *stripfuncnames[] = {
    "lstrip",   /* LEFTSTRIP */
    "rstrip",   /* RIGHTSTRIP */
    "strip",    /* BOTHSTRIP */
};

/* Method name for strip mode i; performs no range check on i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012723
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012724/* externally visible for str.strip(unicode) */
12725PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012726_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012727{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012728 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012729 int kind;
12730 Py_ssize_t i, j, len;
12731 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012732 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012733
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012734 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12735 return NULL;
12736
12737 kind = PyUnicode_KIND(self);
12738 data = PyUnicode_DATA(self);
12739 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012740 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012741 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12742 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012743 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012744
Benjamin Peterson14339b62009-01-31 16:36:08 +000012745 i = 0;
12746 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012747 while (i < len) {
12748 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12749 if (!BLOOM(sepmask, ch))
12750 break;
12751 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12752 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012753 i++;
12754 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012755 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012756
Benjamin Peterson14339b62009-01-31 16:36:08 +000012757 j = len;
12758 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012759 j--;
12760 while (j >= i) {
12761 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12762 if (!BLOOM(sepmask, ch))
12763 break;
12764 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12765 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012766 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012767 }
12768
Benjamin Peterson29060642009-01-31 22:14:21 +000012769 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012770 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012771
Victor Stinner7931d9a2011-11-04 00:22:48 +010012772 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773}
12774
12775PyObject*
12776PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12777{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012778 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012779 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012780 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012781
Victor Stinnerde636f32011-10-01 03:55:54 +020012782 if (PyUnicode_READY(self) == -1)
12783 return NULL;
12784
Victor Stinner684d5fd2012-05-03 02:32:34 +020012785 length = PyUnicode_GET_LENGTH(self);
12786 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012787
Victor Stinner684d5fd2012-05-03 02:32:34 +020012788 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012789 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790
Victor Stinnerde636f32011-10-01 03:55:54 +020012791 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012792 PyErr_SetString(PyExc_IndexError, "string index out of range");
12793 return NULL;
12794 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012795 if (start >= length || end < start)
12796 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012797
Victor Stinner684d5fd2012-05-03 02:32:34 +020012798 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012799 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012800 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012801 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012802 }
12803 else {
12804 kind = PyUnicode_KIND(self);
12805 data = PyUnicode_1BYTE_DATA(self);
12806 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012807 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012808 length);
12809 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012810}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012811
12812static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012813do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012814{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012815 Py_ssize_t len, i, j;
12816
12817 if (PyUnicode_READY(self) == -1)
12818 return NULL;
12819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012820 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012821
Victor Stinnercc7af722013-04-09 22:39:24 +020012822 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012823 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012824
12825 i = 0;
12826 if (striptype != RIGHTSTRIP) {
12827 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012828 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012829 if (!_Py_ascii_whitespace[ch])
12830 break;
12831 i++;
12832 }
12833 }
12834
12835 j = len;
12836 if (striptype != LEFTSTRIP) {
12837 j--;
12838 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012839 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012840 if (!_Py_ascii_whitespace[ch])
12841 break;
12842 j--;
12843 }
12844 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012845 }
12846 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012847 else {
12848 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012849 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012850
Victor Stinnercc7af722013-04-09 22:39:24 +020012851 i = 0;
12852 if (striptype != RIGHTSTRIP) {
12853 while (i < len) {
12854 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12855 if (!Py_UNICODE_ISSPACE(ch))
12856 break;
12857 i++;
12858 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012859 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012860
12861 j = len;
12862 if (striptype != LEFTSTRIP) {
12863 j--;
12864 while (j >= i) {
12865 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12866 if (!Py_UNICODE_ISSPACE(ch))
12867 break;
12868 j--;
12869 }
12870 j++;
12871 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012872 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012873
Victor Stinner7931d9a2011-11-04 00:22:48 +010012874 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012875}
12876
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012877
12878static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012879do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012880{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012881 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012882 if (PyUnicode_Check(sep))
12883 return _PyUnicode_XStrip(self, striptype, sep);
12884 else {
12885 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012886 "%s arg must be None or str",
12887 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012888 return NULL;
12889 }
12890 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012891
Benjamin Peterson14339b62009-01-31 16:36:08 +000012892 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012893}
12894
12895
INADA Naoki3ae20562017-01-16 20:41:20 +090012896/*[clinic input]
12897str.strip as unicode_strip
12898
12899 chars: object = None
12900 /
12901
Zachary Ware09895c22019-10-09 16:09:00 -050012902Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012903
12904If chars is given and not None, remove characters in chars instead.
12905[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012906
12907static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012908unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012909/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012910{
INADA Naoki3ae20562017-01-16 20:41:20 +090012911 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012912}
12913
12914
INADA Naoki3ae20562017-01-16 20:41:20 +090012915/*[clinic input]
12916str.lstrip as unicode_lstrip
12917
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012918 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012919 /
12920
12921Return a copy of the string with leading whitespace removed.
12922
12923If chars is given and not None, remove characters in chars instead.
12924[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012925
12926static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012927unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012928/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012929{
INADA Naoki3ae20562017-01-16 20:41:20 +090012930 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012931}
12932
12933
INADA Naoki3ae20562017-01-16 20:41:20 +090012934/*[clinic input]
12935str.rstrip as unicode_rstrip
12936
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012937 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012938 /
12939
12940Return a copy of the string with trailing whitespace removed.
12941
12942If chars is given and not None, remove characters in chars instead.
12943[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012944
12945static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012946unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012947/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012948{
INADA Naoki3ae20562017-01-16 20:41:20 +090012949 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012950}
12951
12952
Guido van Rossumd57fd912000-03-10 22:53:23 +000012953static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012954unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012956 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958
Serhiy Storchaka05997252013-01-26 12:14:02 +020012959 if (len < 1)
12960 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961
Victor Stinnerc4b49542011-12-11 22:44:26 +010012962 /* no repeat, return original string */
12963 if (len == 1)
12964 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012965
Benjamin Petersonbac79492012-01-14 13:34:47 -050012966 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 return NULL;
12968
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012969 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012970 PyErr_SetString(PyExc_OverflowError,
12971 "repeated string is too long");
12972 return NULL;
12973 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012974 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012975
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012976 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012977 if (!u)
12978 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012979 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012982 int kind = PyUnicode_KIND(str);
12983 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012984 if (kind == PyUnicode_1BYTE_KIND) {
12985 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012986 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012987 }
12988 else if (kind == PyUnicode_2BYTE_KIND) {
12989 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012990 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012991 ucs2[n] = fill_char;
12992 } else {
12993 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12994 assert(kind == PyUnicode_4BYTE_KIND);
12995 for (n = 0; n < len; ++n)
12996 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012997 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 }
12999 else {
13000 /* number of characters copied this far */
13001 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013002 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020013004 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000013006 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020013008 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013009 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000013010 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000013011 }
13012
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013013 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013014 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013015}
13016
Alexander Belopolsky40018472011-02-26 01:02:56 +000013017PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013018PyUnicode_Replace(PyObject *str,
13019 PyObject *substr,
13020 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000013021 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013022{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013023 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
13024 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013025 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013026 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013027}
13028
INADA Naoki3ae20562017-01-16 20:41:20 +090013029/*[clinic input]
13030str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000013031
INADA Naoki3ae20562017-01-16 20:41:20 +090013032 old: unicode
13033 new: unicode
13034 count: Py_ssize_t = -1
13035 Maximum number of occurrences to replace.
13036 -1 (the default value) means replace all occurrences.
13037 /
13038
13039Return a copy with all occurrences of substring old replaced by new.
13040
13041If the optional argument count is given, only the first count occurrences are
13042replaced.
13043[clinic start generated code]*/
13044
13045static PyObject *
13046unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
13047 Py_ssize_t count)
13048/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013049{
Benjamin Peterson22a29702012-01-02 09:00:30 -060013050 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013051 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090013052 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013053}
13054
sweeneydea81849b2020-04-22 17:05:48 -040013055/*[clinic input]
13056str.removeprefix as unicode_removeprefix
13057
13058 prefix: unicode
13059 /
13060
13061Return a str with the given prefix string removed if present.
13062
13063If the string starts with the prefix string, return string[len(prefix):].
13064Otherwise, return a copy of the original string.
13065[clinic start generated code]*/
13066
13067static PyObject *
13068unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13069/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13070{
13071 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13072 if (match == -1) {
13073 return NULL;
13074 }
13075 if (match) {
13076 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13077 PyUnicode_GET_LENGTH(self));
13078 }
13079 return unicode_result_unchanged(self);
13080}
13081
13082/*[clinic input]
13083str.removesuffix as unicode_removesuffix
13084
13085 suffix: unicode
13086 /
13087
13088Return a str with the given suffix string removed if present.
13089
13090If the string ends with the suffix string and that suffix is not empty,
13091return string[:-len(suffix)]. Otherwise, return a copy of the original
13092string.
13093[clinic start generated code]*/
13094
13095static PyObject *
13096unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13097/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13098{
13099 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13100 if (match == -1) {
13101 return NULL;
13102 }
13103 if (match) {
13104 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13105 - PyUnicode_GET_LENGTH(suffix));
13106 }
13107 return unicode_result_unchanged(self);
13108}
13109
Alexander Belopolsky40018472011-02-26 01:02:56 +000013110static PyObject *
13111unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112{
Walter Dörwald79e913e2007-05-12 11:08:06 +000013113 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013114 Py_ssize_t isize;
13115 Py_ssize_t osize, squote, dquote, i, o;
13116 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020013117 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013118 const void *idata;
13119 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000013120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013121 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000013122 return NULL;
13123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013124 isize = PyUnicode_GET_LENGTH(unicode);
13125 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000013126
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013127 /* Compute length of output, quote characters, and
13128 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020013129 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 max = 127;
13131 squote = dquote = 0;
13132 ikind = PyUnicode_KIND(unicode);
13133 for (i = 0; i < isize; i++) {
13134 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040013135 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013136 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040013137 case '\'': squote++; break;
13138 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013139 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040013140 incr = 2;
13141 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013142 default:
13143 /* Fast-path ASCII */
13144 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013145 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013146 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013147 ;
13148 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013150 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013151 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013152 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013153 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013154 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013155 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013156 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013157 if (osize > PY_SSIZE_T_MAX - incr) {
13158 PyErr_SetString(PyExc_OverflowError,
13159 "string is too long to generate repr");
13160 return NULL;
13161 }
13162 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013163 }
13164
13165 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013166 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013167 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013168 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013169 if (dquote)
13170 /* Both squote and dquote present. Use squote,
13171 and escape them */
13172 osize += squote;
13173 else
13174 quote = '"';
13175 }
Victor Stinner55c08782013-04-14 18:45:39 +020013176 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013177
13178 repr = PyUnicode_New(osize, max);
13179 if (repr == NULL)
13180 return NULL;
13181 okind = PyUnicode_KIND(repr);
13182 odata = PyUnicode_DATA(repr);
13183
13184 PyUnicode_WRITE(okind, odata, 0, quote);
13185 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013186 if (unchanged) {
13187 _PyUnicode_FastCopyCharacters(repr, 1,
13188 unicode, 0,
13189 isize);
13190 }
13191 else {
13192 for (i = 0, o = 1; i < isize; i++) {
13193 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013194
Victor Stinner55c08782013-04-14 18:45:39 +020013195 /* Escape quotes and backslashes */
13196 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013197 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013198 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013199 continue;
13200 }
13201
13202 /* Map special whitespace to '\t', \n', '\r' */
13203 if (ch == '\t') {
13204 PyUnicode_WRITE(okind, odata, o++, '\\');
13205 PyUnicode_WRITE(okind, odata, o++, 't');
13206 }
13207 else if (ch == '\n') {
13208 PyUnicode_WRITE(okind, odata, o++, '\\');
13209 PyUnicode_WRITE(okind, odata, o++, 'n');
13210 }
13211 else if (ch == '\r') {
13212 PyUnicode_WRITE(okind, odata, o++, '\\');
13213 PyUnicode_WRITE(okind, odata, o++, 'r');
13214 }
13215
13216 /* Map non-printable US ASCII to '\xhh' */
13217 else if (ch < ' ' || ch == 0x7F) {
13218 PyUnicode_WRITE(okind, odata, o++, '\\');
13219 PyUnicode_WRITE(okind, odata, o++, 'x');
13220 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13221 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13222 }
13223
13224 /* Copy ASCII characters as-is */
13225 else if (ch < 0x7F) {
13226 PyUnicode_WRITE(okind, odata, o++, ch);
13227 }
13228
13229 /* Non-ASCII characters */
13230 else {
13231 /* Map Unicode whitespace and control characters
13232 (categories Z* and C* except ASCII space)
13233 */
13234 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13235 PyUnicode_WRITE(okind, odata, o++, '\\');
13236 /* Map 8-bit characters to '\xhh' */
13237 if (ch <= 0xff) {
13238 PyUnicode_WRITE(okind, odata, o++, 'x');
13239 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13240 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13241 }
13242 /* Map 16-bit characters to '\uxxxx' */
13243 else if (ch <= 0xffff) {
13244 PyUnicode_WRITE(okind, odata, o++, 'u');
13245 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13246 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13247 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13248 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13249 }
13250 /* Map 21-bit characters to '\U00xxxxxx' */
13251 else {
13252 PyUnicode_WRITE(okind, odata, o++, 'U');
13253 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13254 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13255 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13256 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13257 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13258 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13259 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13260 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13261 }
13262 }
13263 /* Copy characters as-is */
13264 else {
13265 PyUnicode_WRITE(okind, odata, o++, ch);
13266 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013267 }
13268 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013270 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013271 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013272 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013273}
13274
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013275PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013276 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277\n\
13278Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013279such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013280arguments start and end are interpreted as in slice notation.\n\
13281\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013282Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013283
13284static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013285unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013286{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013287 /* initialize variables to prevent gcc warning */
13288 PyObject *substring = NULL;
13289 Py_ssize_t start = 0;
13290 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013291 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013292
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013293 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013294 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013296 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013297 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013298
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013299 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013301 if (result == -2)
13302 return NULL;
13303
Christian Heimes217cfd12007-12-02 14:31:20 +000013304 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013305}
13306
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013307PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013308 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013310Return the highest index in S where substring sub is found,\n\
13311such that sub is contained within S[start:end]. Optional\n\
13312arguments start and end are interpreted as in slice notation.\n\
13313\n\
13314Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013315
13316static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013317unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013318{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013319 /* initialize variables to prevent gcc warning */
13320 PyObject *substring = NULL;
13321 Py_ssize_t start = 0;
13322 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013323 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013324
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013325 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013326 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013327
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013328 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013329 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013330
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013331 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013332
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013333 if (result == -2)
13334 return NULL;
13335
Guido van Rossumd57fd912000-03-10 22:53:23 +000013336 if (result < 0) {
13337 PyErr_SetString(PyExc_ValueError, "substring not found");
13338 return NULL;
13339 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013340
Christian Heimes217cfd12007-12-02 14:31:20 +000013341 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013342}
13343
INADA Naoki3ae20562017-01-16 20:41:20 +090013344/*[clinic input]
13345str.rjust as unicode_rjust
13346
13347 width: Py_ssize_t
13348 fillchar: Py_UCS4 = ' '
13349 /
13350
13351Return a right-justified string of length width.
13352
13353Padding is done using the specified fill character (default is a space).
13354[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013355
13356static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013357unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13358/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013359{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013360 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013361 return NULL;
13362
Victor Stinnerc4b49542011-12-11 22:44:26 +010013363 if (PyUnicode_GET_LENGTH(self) >= width)
13364 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013365
Victor Stinnerc4b49542011-12-11 22:44:26 +010013366 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013367}
13368
Alexander Belopolsky40018472011-02-26 01:02:56 +000013369PyObject *
13370PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013372 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013373 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013374
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013375 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013376}
13377
INADA Naoki3ae20562017-01-16 20:41:20 +090013378/*[clinic input]
13379str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013380
INADA Naoki3ae20562017-01-16 20:41:20 +090013381 sep: object = None
13382 The delimiter according which to split the string.
13383 None (the default value) means split according to any whitespace,
13384 and discard empty strings from the result.
13385 maxsplit: Py_ssize_t = -1
13386 Maximum number of splits to do.
13387 -1 (the default value) means no limit.
13388
13389Return a list of the words in the string, using sep as the delimiter string.
13390[clinic start generated code]*/
13391
13392static PyObject *
13393unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13394/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013395{
INADA Naoki3ae20562017-01-16 20:41:20 +090013396 if (sep == Py_None)
13397 return split(self, NULL, maxsplit);
13398 if (PyUnicode_Check(sep))
13399 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013400
Victor Stinner998b8062018-09-12 00:23:25 +020013401 PyErr_Format(PyExc_TypeError,
13402 "must be str or None, not %.100s",
13403 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013404 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013405}
13406
Thomas Wouters477c8d52006-05-27 19:21:47 +000013407PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013408PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013409{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013410 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013411 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013412 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013413 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013414
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013415 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013416 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013417
Victor Stinner14f8f022011-10-05 20:58:25 +020013418 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013419 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013420 len1 = PyUnicode_GET_LENGTH(str_obj);
13421 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013422 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013423 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013424 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013425 }
13426 buf1 = PyUnicode_DATA(str_obj);
13427 buf2 = PyUnicode_DATA(sep_obj);
13428 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013429 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013430 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013431 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013432 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013433
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013434 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013435 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013436 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13437 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13438 else
13439 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013440 break;
13441 case PyUnicode_2BYTE_KIND:
13442 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13443 break;
13444 case PyUnicode_4BYTE_KIND:
13445 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13446 break;
13447 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013448 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013449 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013450
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013451 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013452 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013453 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013454
13455 return out;
13456}
13457
13458
13459PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013460PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013461{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013462 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013463 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013464 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013465 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013466
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013467 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013468 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013469
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013470 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013471 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013472 len1 = PyUnicode_GET_LENGTH(str_obj);
13473 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013474 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013475 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013476 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013477 }
13478 buf1 = PyUnicode_DATA(str_obj);
13479 buf2 = PyUnicode_DATA(sep_obj);
13480 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013481 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013482 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013483 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013484 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013485
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013486 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013487 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013488 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13489 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13490 else
13491 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013492 break;
13493 case PyUnicode_2BYTE_KIND:
13494 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13495 break;
13496 case PyUnicode_4BYTE_KIND:
13497 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13498 break;
13499 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013500 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013501 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013502
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013503 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013504 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013505 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013506
13507 return out;
13508}
13509
INADA Naoki3ae20562017-01-16 20:41:20 +090013510/*[clinic input]
13511str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013512
INADA Naoki3ae20562017-01-16 20:41:20 +090013513 sep: object
13514 /
13515
13516Partition the string into three parts using the given separator.
13517
13518This will search for the separator in the string. If the separator is found,
13519returns a 3-tuple containing the part before the separator, the separator
13520itself, and the part after it.
13521
13522If the separator is not found, returns a 3-tuple containing the original string
13523and two empty strings.
13524[clinic start generated code]*/
13525
13526static PyObject *
13527unicode_partition(PyObject *self, PyObject *sep)
13528/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013529{
INADA Naoki3ae20562017-01-16 20:41:20 +090013530 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013531}
13532
INADA Naoki3ae20562017-01-16 20:41:20 +090013533/*[clinic input]
13534str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013535
INADA Naoki3ae20562017-01-16 20:41:20 +090013536Partition the string into three parts using the given separator.
13537
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013538This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013539the separator is found, returns a 3-tuple containing the part before the
13540separator, the separator itself, and the part after it.
13541
13542If the separator is not found, returns a 3-tuple containing two empty strings
13543and the original string.
13544[clinic start generated code]*/
13545
13546static PyObject *
13547unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013548/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013549{
INADA Naoki3ae20562017-01-16 20:41:20 +090013550 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013551}
13552
Alexander Belopolsky40018472011-02-26 01:02:56 +000013553PyObject *
13554PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013555{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013556 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013557 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013558
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013559 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013560}
13561
INADA Naoki3ae20562017-01-16 20:41:20 +090013562/*[clinic input]
13563str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013564
INADA Naoki3ae20562017-01-16 20:41:20 +090013565Return a list of the words in the string, using sep as the delimiter string.
13566
13567Splits are done starting at the end of the string and working to the front.
13568[clinic start generated code]*/
13569
13570static PyObject *
13571unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13572/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013573{
INADA Naoki3ae20562017-01-16 20:41:20 +090013574 if (sep == Py_None)
13575 return rsplit(self, NULL, maxsplit);
13576 if (PyUnicode_Check(sep))
13577 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013578
Victor Stinner998b8062018-09-12 00:23:25 +020013579 PyErr_Format(PyExc_TypeError,
13580 "must be str or None, not %.100s",
13581 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013582 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013583}
13584
INADA Naoki3ae20562017-01-16 20:41:20 +090013585/*[clinic input]
13586str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013587
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013588 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013589
13590Return a list of the lines in the string, breaking at line boundaries.
13591
13592Line breaks are not included in the resulting list unless keepends is given and
13593true.
13594[clinic start generated code]*/
13595
13596static PyObject *
13597unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013598/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013599{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013600 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013601}
13602
13603static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013604PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013605{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013606 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013607}
13608
INADA Naoki3ae20562017-01-16 20:41:20 +090013609/*[clinic input]
13610str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013611
INADA Naoki3ae20562017-01-16 20:41:20 +090013612Convert uppercase characters to lowercase and lowercase characters to uppercase.
13613[clinic start generated code]*/
13614
13615static PyObject *
13616unicode_swapcase_impl(PyObject *self)
13617/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013618{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013619 if (PyUnicode_READY(self) == -1)
13620 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013621 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013622}
13623
Larry Hastings61272b72014-01-07 12:41:53 -080013624/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013625
Larry Hastings31826802013-10-19 00:09:25 -070013626@staticmethod
13627str.maketrans as unicode_maketrans
13628
13629 x: object
13630
13631 y: unicode=NULL
13632
13633 z: unicode=NULL
13634
13635 /
13636
13637Return a translation table usable for str.translate().
13638
13639If there is only one argument, it must be a dictionary mapping Unicode
13640ordinals (integers) or characters to Unicode ordinals, strings or None.
13641Character keys will be then converted to ordinals.
13642If there are two arguments, they must be strings of equal length, and
13643in the resulting dictionary, each character in x will be mapped to the
13644character at the same position in y. If there is a third argument, it
13645must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013646[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013647
Larry Hastings31826802013-10-19 00:09:25 -070013648static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013649unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013650/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013651{
Georg Brandlceee0772007-11-27 23:48:05 +000013652 PyObject *new = NULL, *key, *value;
13653 Py_ssize_t i = 0;
13654 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013655
Georg Brandlceee0772007-11-27 23:48:05 +000013656 new = PyDict_New();
13657 if (!new)
13658 return NULL;
13659 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013660 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013661 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013662
Georg Brandlceee0772007-11-27 23:48:05 +000013663 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013664 if (!PyUnicode_Check(x)) {
13665 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13666 "be a string if there is a second argument");
13667 goto err;
13668 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013669 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013670 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13671 "arguments must have equal length");
13672 goto err;
13673 }
13674 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013675 x_kind = PyUnicode_KIND(x);
13676 y_kind = PyUnicode_KIND(y);
13677 x_data = PyUnicode_DATA(x);
13678 y_data = PyUnicode_DATA(y);
13679 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13680 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013681 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013682 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013683 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013684 if (!value) {
13685 Py_DECREF(key);
13686 goto err;
13687 }
Georg Brandlceee0772007-11-27 23:48:05 +000013688 res = PyDict_SetItem(new, key, value);
13689 Py_DECREF(key);
13690 Py_DECREF(value);
13691 if (res < 0)
13692 goto err;
13693 }
13694 /* create entries for deleting chars in z */
13695 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013696 z_kind = PyUnicode_KIND(z);
13697 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013698 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013699 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013700 if (!key)
13701 goto err;
13702 res = PyDict_SetItem(new, key, Py_None);
13703 Py_DECREF(key);
13704 if (res < 0)
13705 goto err;
13706 }
13707 }
13708 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013709 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013710 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013711
Georg Brandlceee0772007-11-27 23:48:05 +000013712 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013713 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013714 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13715 "to maketrans it must be a dict");
13716 goto err;
13717 }
13718 /* copy entries into the new dict, converting string keys to int keys */
13719 while (PyDict_Next(x, &i, &key, &value)) {
13720 if (PyUnicode_Check(key)) {
13721 /* convert string keys to integer keys */
13722 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013723 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013724 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13725 "table must be of length 1");
13726 goto err;
13727 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013728 kind = PyUnicode_KIND(key);
13729 data = PyUnicode_DATA(key);
13730 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013731 if (!newkey)
13732 goto err;
13733 res = PyDict_SetItem(new, newkey, value);
13734 Py_DECREF(newkey);
13735 if (res < 0)
13736 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013737 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013738 /* just keep integer keys */
13739 if (PyDict_SetItem(new, key, value) < 0)
13740 goto err;
13741 } else {
13742 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13743 "be strings or integers");
13744 goto err;
13745 }
13746 }
13747 }
13748 return new;
13749 err:
13750 Py_DECREF(new);
13751 return NULL;
13752}
13753
INADA Naoki3ae20562017-01-16 20:41:20 +090013754/*[clinic input]
13755str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013756
INADA Naoki3ae20562017-01-16 20:41:20 +090013757 table: object
13758 Translation table, which must be a mapping of Unicode ordinals to
13759 Unicode ordinals, strings, or None.
13760 /
13761
13762Replace each character in the string using the given translation table.
13763
13764The table must implement lookup/indexing via __getitem__, for instance a
13765dictionary or list. If this operation raises LookupError, the character is
13766left untouched. Characters mapped to None are deleted.
13767[clinic start generated code]*/
13768
13769static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013770unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013771/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013772{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013773 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013774}
13775
INADA Naoki3ae20562017-01-16 20:41:20 +090013776/*[clinic input]
13777str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013778
INADA Naoki3ae20562017-01-16 20:41:20 +090013779Return a copy of the string converted to uppercase.
13780[clinic start generated code]*/
13781
13782static PyObject *
13783unicode_upper_impl(PyObject *self)
13784/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013785{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013786 if (PyUnicode_READY(self) == -1)
13787 return NULL;
13788 if (PyUnicode_IS_ASCII(self))
13789 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013790 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013791}
13792
INADA Naoki3ae20562017-01-16 20:41:20 +090013793/*[clinic input]
13794str.zfill as unicode_zfill
13795
13796 width: Py_ssize_t
13797 /
13798
13799Pad a numeric string with zeros on the left, to fill a field of the given width.
13800
13801The string is never truncated.
13802[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013803
13804static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013805unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013806/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013807{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013808 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013809 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013810 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013811 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013812 Py_UCS4 chr;
13813
Benjamin Petersonbac79492012-01-14 13:34:47 -050013814 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013815 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013816
Victor Stinnerc4b49542011-12-11 22:44:26 +010013817 if (PyUnicode_GET_LENGTH(self) >= width)
13818 return unicode_result_unchanged(self);
13819
13820 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013821
13822 u = pad(self, fill, 0, '0');
13823
Walter Dörwald068325e2002-04-15 13:36:47 +000013824 if (u == NULL)
13825 return NULL;
13826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013827 kind = PyUnicode_KIND(u);
13828 data = PyUnicode_DATA(u);
13829 chr = PyUnicode_READ(kind, data, fill);
13830
13831 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013832 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013833 PyUnicode_WRITE(kind, data, 0, chr);
13834 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013835 }
13836
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013837 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013838 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013839}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013840
#if 0
/* Compiled out.  Wrapper that forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13848
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013849PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013850 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013851\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013852Return True if S starts with the specified prefix, False otherwise.\n\
13853With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013854With optional end, stop comparing S at that position.\n\
13855prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013856
13857static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013858unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013859 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013860{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013861 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013862 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013863 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013864 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013865 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013866
Jesus Ceaac451502011-04-20 17:09:23 +020013867 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013868 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013869 if (PyTuple_Check(subobj)) {
13870 Py_ssize_t i;
13871 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013872 substring = PyTuple_GET_ITEM(subobj, i);
13873 if (!PyUnicode_Check(substring)) {
13874 PyErr_Format(PyExc_TypeError,
13875 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013876 "not %.100s",
13877 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013878 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013879 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013880 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013881 if (result == -1)
13882 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013883 if (result) {
13884 Py_RETURN_TRUE;
13885 }
13886 }
13887 /* nothing matched */
13888 Py_RETURN_FALSE;
13889 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013890 if (!PyUnicode_Check(subobj)) {
13891 PyErr_Format(PyExc_TypeError,
13892 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013893 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013894 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013895 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013896 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013897 if (result == -1)
13898 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013899 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013900}
13901
13902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013903PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013904 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013905\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013906Return True if S ends with the specified suffix, False otherwise.\n\
13907With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013908With optional end, stop comparing S at that position.\n\
13909suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013910
13911static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013912unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013913 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013914{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013915 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013916 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013917 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013918 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013919 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013920
Jesus Ceaac451502011-04-20 17:09:23 +020013921 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013922 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013923 if (PyTuple_Check(subobj)) {
13924 Py_ssize_t i;
13925 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013926 substring = PyTuple_GET_ITEM(subobj, i);
13927 if (!PyUnicode_Check(substring)) {
13928 PyErr_Format(PyExc_TypeError,
13929 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013930 "not %.100s",
13931 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013932 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013933 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013934 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013935 if (result == -1)
13936 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013937 if (result) {
13938 Py_RETURN_TRUE;
13939 }
13940 }
13941 Py_RETURN_FALSE;
13942 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013943 if (!PyUnicode_Check(subobj)) {
13944 PyErr_Format(PyExc_TypeError,
13945 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013946 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013947 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013948 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013949 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013950 if (result == -1)
13951 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013952 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013953}
13954
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013955static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013956_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013957{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013958 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13959 writer->data = PyUnicode_DATA(writer->buffer);
13960
13961 if (!writer->readonly) {
13962 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013963 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013964 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013965 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013966 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13967 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13968 writer->kind = PyUnicode_WCHAR_KIND;
13969 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13970
Victor Stinner8f674cc2013-04-17 23:02:17 +020013971 /* Copy-on-write mode: set buffer size to 0 so
13972 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13973 * next write. */
13974 writer->size = 0;
13975 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013976}
13977
Victor Stinnerd3f08822012-05-29 12:57:52 +020013978void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013979_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013980{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013981 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013982
13983 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013984 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013985
13986 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13987 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13988 writer->kind = PyUnicode_WCHAR_KIND;
13989 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013990}
13991
Inada Naoki770847a2019-06-24 12:30:24 +090013992// Initialize _PyUnicodeWriter with initial buffer
13993static inline void
13994_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13995{
13996 memset(writer, 0, sizeof(*writer));
13997 writer->buffer = buffer;
13998 _PyUnicodeWriter_Update(writer);
13999 writer->min_length = writer->size;
14000}
14001
Victor Stinnerd3f08822012-05-29 12:57:52 +020014002int
14003_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
14004 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020014005{
14006 Py_ssize_t newlen;
14007 PyObject *newbuffer;
14008
Victor Stinner2740e462016-09-06 16:58:36 -070014009 assert(maxchar <= MAX_UNICODE);
14010
Victor Stinnerca9381e2015-09-22 00:58:32 +020014011 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020014012 assert((maxchar > writer->maxchar && length >= 0)
14013 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014014
Victor Stinner202fdca2012-05-07 12:47:02 +020014015 if (length > PY_SSIZE_T_MAX - writer->pos) {
14016 PyErr_NoMemory();
14017 return -1;
14018 }
14019 newlen = writer->pos + length;
14020
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014021 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020014022
Victor Stinnerd3f08822012-05-29 12:57:52 +020014023 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020014024 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010014025 if (writer->overallocate
14026 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14027 /* overallocate to limit the number of realloc() */
14028 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014029 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014030 if (newlen < writer->min_length)
14031 newlen = writer->min_length;
14032
Victor Stinnerd3f08822012-05-29 12:57:52 +020014033 writer->buffer = PyUnicode_New(newlen, maxchar);
14034 if (writer->buffer == NULL)
14035 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014036 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014037 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010014038 if (writer->overallocate
14039 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14040 /* overallocate to limit the number of realloc() */
14041 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014042 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014043 if (newlen < writer->min_length)
14044 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014045
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014046 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020014047 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030014048 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020014049 newbuffer = PyUnicode_New(newlen, maxchar);
14050 if (newbuffer == NULL)
14051 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014052 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14053 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020014054 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014055 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020014056 }
14057 else {
14058 newbuffer = resize_compact(writer->buffer, newlen);
14059 if (newbuffer == NULL)
14060 return -1;
14061 }
14062 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020014063 }
14064 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014065 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014066 newbuffer = PyUnicode_New(writer->size, maxchar);
14067 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020014068 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014069 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14070 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030014071 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014072 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014073 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014074 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010014075
14076#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020014077}
14078
Victor Stinnerca9381e2015-09-22 00:58:32 +020014079int
14080_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14081 enum PyUnicode_Kind kind)
14082{
14083 Py_UCS4 maxchar;
14084
14085 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14086 assert(writer->kind < kind);
14087
14088 switch (kind)
14089 {
14090 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14091 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14092 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
14093 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014094 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020014095 }
14096
14097 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14098}
14099
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014100static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014101_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020014102{
Victor Stinner2740e462016-09-06 16:58:36 -070014103 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020014104 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14105 return -1;
14106 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14107 writer->pos++;
14108 return 0;
14109}
14110
14111int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014112_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14113{
14114 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14115}
14116
14117int
Victor Stinnerd3f08822012-05-29 12:57:52 +020014118_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14119{
14120 Py_UCS4 maxchar;
14121 Py_ssize_t len;
14122
14123 if (PyUnicode_READY(str) == -1)
14124 return -1;
14125 len = PyUnicode_GET_LENGTH(str);
14126 if (len == 0)
14127 return 0;
14128 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14129 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014130 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010014131 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020014132 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014133 Py_INCREF(str);
14134 writer->buffer = str;
14135 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014136 writer->pos += len;
14137 return 0;
14138 }
14139 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14140 return -1;
14141 }
14142 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14143 str, 0, len);
14144 writer->pos += len;
14145 return 0;
14146}
14147
Victor Stinnere215d962012-10-06 23:03:36 +020014148int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014149_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14150 Py_ssize_t start, Py_ssize_t end)
14151{
14152 Py_UCS4 maxchar;
14153 Py_ssize_t len;
14154
14155 if (PyUnicode_READY(str) == -1)
14156 return -1;
14157
14158 assert(0 <= start);
14159 assert(end <= PyUnicode_GET_LENGTH(str));
14160 assert(start <= end);
14161
14162 if (end == 0)
14163 return 0;
14164
14165 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14166 return _PyUnicodeWriter_WriteStr(writer, str);
14167
14168 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14169 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14170 else
14171 maxchar = writer->maxchar;
14172 len = end - start;
14173
14174 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14175 return -1;
14176
14177 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14178 str, start, len);
14179 writer->pos += len;
14180 return 0;
14181}
14182
14183int
Victor Stinner4a587072013-11-19 12:54:53 +010014184_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14185 const char *ascii, Py_ssize_t len)
14186{
14187 if (len == -1)
14188 len = strlen(ascii);
14189
Andy Lestere6be9b52020-02-11 20:28:35 -060014190 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014191
14192 if (writer->buffer == NULL && !writer->overallocate) {
14193 PyObject *str;
14194
14195 str = _PyUnicode_FromASCII(ascii, len);
14196 if (str == NULL)
14197 return -1;
14198
14199 writer->readonly = 1;
14200 writer->buffer = str;
14201 _PyUnicodeWriter_Update(writer);
14202 writer->pos += len;
14203 return 0;
14204 }
14205
14206 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14207 return -1;
14208
14209 switch (writer->kind)
14210 {
14211 case PyUnicode_1BYTE_KIND:
14212 {
14213 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14214 Py_UCS1 *data = writer->data;
14215
Christian Heimesf051e432016-09-13 20:22:02 +020014216 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014217 break;
14218 }
14219 case PyUnicode_2BYTE_KIND:
14220 {
14221 _PyUnicode_CONVERT_BYTES(
14222 Py_UCS1, Py_UCS2,
14223 ascii, ascii + len,
14224 (Py_UCS2 *)writer->data + writer->pos);
14225 break;
14226 }
14227 case PyUnicode_4BYTE_KIND:
14228 {
14229 _PyUnicode_CONVERT_BYTES(
14230 Py_UCS1, Py_UCS4,
14231 ascii, ascii + len,
14232 (Py_UCS4 *)writer->data + writer->pos);
14233 break;
14234 }
14235 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014236 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014237 }
14238
14239 writer->pos += len;
14240 return 0;
14241}
14242
14243int
14244_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14245 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014246{
14247 Py_UCS4 maxchar;
14248
Andy Lestere6be9b52020-02-11 20:28:35 -060014249 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014250 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14251 return -1;
14252 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14253 writer->pos += len;
14254 return 0;
14255}
14256
Victor Stinnerd3f08822012-05-29 12:57:52 +020014257PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014258_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014259{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014260 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014261
Victor Stinnerd3f08822012-05-29 12:57:52 +020014262 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014263 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014264 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014265 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014266
14267 str = writer->buffer;
14268 writer->buffer = NULL;
14269
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014270 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014271 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14272 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014273 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014274
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014275 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14276 PyObject *str2;
14277 str2 = resize_compact(str, writer->pos);
14278 if (str2 == NULL) {
14279 Py_DECREF(str);
14280 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014281 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014282 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014283 }
14284
Victor Stinner15a0bd32013-07-08 22:29:55 +020014285 assert(_PyUnicode_CheckConsistency(str, 1));
14286 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014287}
14288
Victor Stinnerd3f08822012-05-29 12:57:52 +020014289void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014290_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014291{
14292 Py_CLEAR(writer->buffer);
14293}
14294
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014295#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014296
14297PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014298 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014299\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014300Return a formatted version of S, using substitutions from args and kwargs.\n\
14301The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014302
Eric Smith27bbca62010-11-04 17:06:58 +000014303PyDoc_STRVAR(format_map__doc__,
14304 "S.format_map(mapping) -> str\n\
14305\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014306Return a formatted version of S, using substitutions from mapping.\n\
14307The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014308
INADA Naoki3ae20562017-01-16 20:41:20 +090014309/*[clinic input]
14310str.__format__ as unicode___format__
14311
14312 format_spec: unicode
14313 /
14314
14315Return a formatted version of the string as described by format_spec.
14316[clinic start generated code]*/
14317
Eric Smith4a7d76d2008-05-30 18:10:19 +000014318static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014319unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014320/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014321{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014322 _PyUnicodeWriter writer;
14323 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014324
Victor Stinnerd3f08822012-05-29 12:57:52 +020014325 if (PyUnicode_READY(self) == -1)
14326 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014327 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014328 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14329 self, format_spec, 0,
14330 PyUnicode_GET_LENGTH(format_spec));
14331 if (ret == -1) {
14332 _PyUnicodeWriter_Dealloc(&writer);
14333 return NULL;
14334 }
14335 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014336}
14337
INADA Naoki3ae20562017-01-16 20:41:20 +090014338/*[clinic input]
14339str.__sizeof__ as unicode_sizeof
14340
14341Return the size of the string in memory, in bytes.
14342[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014343
14344static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014345unicode_sizeof_impl(PyObject *self)
14346/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014347{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014348 Py_ssize_t size;
14349
14350 /* If it's a compact object, account for base structure +
14351 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014352 if (PyUnicode_IS_COMPACT_ASCII(self))
14353 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14354 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014355 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014356 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014357 else {
14358 /* If it is a two-block object, account for base object, and
14359 for character block if present. */
14360 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014361 if (_PyUnicode_DATA_ANY(self))
14362 size += (PyUnicode_GET_LENGTH(self) + 1) *
14363 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014364 }
14365 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014366 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014367 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14368 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14369 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14370 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014371
14372 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014373}
14374
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014375static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014376unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014377{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014378 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014379 if (!copy)
14380 return NULL;
14381 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014382}
14383
Guido van Rossumd57fd912000-03-10 22:53:23 +000014384static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014385 UNICODE_ENCODE_METHODDEF
14386 UNICODE_REPLACE_METHODDEF
14387 UNICODE_SPLIT_METHODDEF
14388 UNICODE_RSPLIT_METHODDEF
14389 UNICODE_JOIN_METHODDEF
14390 UNICODE_CAPITALIZE_METHODDEF
14391 UNICODE_CASEFOLD_METHODDEF
14392 UNICODE_TITLE_METHODDEF
14393 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014394 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014395 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014396 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014397 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014398 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014399 UNICODE_LJUST_METHODDEF
14400 UNICODE_LOWER_METHODDEF
14401 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014402 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14403 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014404 UNICODE_RJUST_METHODDEF
14405 UNICODE_RSTRIP_METHODDEF
14406 UNICODE_RPARTITION_METHODDEF
14407 UNICODE_SPLITLINES_METHODDEF
14408 UNICODE_STRIP_METHODDEF
14409 UNICODE_SWAPCASE_METHODDEF
14410 UNICODE_TRANSLATE_METHODDEF
14411 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014412 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14413 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014414 UNICODE_REMOVEPREFIX_METHODDEF
14415 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014416 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014417 UNICODE_ISLOWER_METHODDEF
14418 UNICODE_ISUPPER_METHODDEF
14419 UNICODE_ISTITLE_METHODDEF
14420 UNICODE_ISSPACE_METHODDEF
14421 UNICODE_ISDECIMAL_METHODDEF
14422 UNICODE_ISDIGIT_METHODDEF
14423 UNICODE_ISNUMERIC_METHODDEF
14424 UNICODE_ISALPHA_METHODDEF
14425 UNICODE_ISALNUM_METHODDEF
14426 UNICODE_ISIDENTIFIER_METHODDEF
14427 UNICODE_ISPRINTABLE_METHODDEF
14428 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014429 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014430 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014431 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014432 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014433 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014434#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014435 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014436 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014437#endif
14438
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014439 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014440 {NULL, NULL}
14441};
14442
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014443static PyObject *
14444unicode_mod(PyObject *v, PyObject *w)
14445{
Brian Curtindfc80e32011-08-10 20:28:54 -050014446 if (!PyUnicode_Check(v))
14447 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014448 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014449}
14450
14451static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014452 0, /*nb_add*/
14453 0, /*nb_subtract*/
14454 0, /*nb_multiply*/
14455 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014456};
14457
Guido van Rossumd57fd912000-03-10 22:53:23 +000014458static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014459 (lenfunc) unicode_length, /* sq_length */
14460 PyUnicode_Concat, /* sq_concat */
14461 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14462 (ssizeargfunc) unicode_getitem, /* sq_item */
14463 0, /* sq_slice */
14464 0, /* sq_ass_item */
14465 0, /* sq_ass_slice */
14466 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014467};
14468
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014469static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014470unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014471{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014472 if (PyUnicode_READY(self) == -1)
14473 return NULL;
14474
Victor Stinnera15e2602020-04-08 02:01:56 +020014475 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014476 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014477 if (i == -1 && PyErr_Occurred())
14478 return NULL;
14479 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014480 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014481 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014482 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014483 Py_ssize_t start, stop, step, slicelength, i;
14484 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014485 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014486 const void *src_data;
14487 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014488 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014489 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014490
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014491 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014492 return NULL;
14493 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014494 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14495 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014496
14497 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014498 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014499 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014500 slicelength == PyUnicode_GET_LENGTH(self)) {
14501 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014502 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014503 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014504 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014505 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014506 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014507 src_kind = PyUnicode_KIND(self);
14508 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014509 if (!PyUnicode_IS_ASCII(self)) {
14510 kind_limit = kind_maxchar_limit(src_kind);
14511 max_char = 0;
14512 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14513 ch = PyUnicode_READ(src_kind, src_data, cur);
14514 if (ch > max_char) {
14515 max_char = ch;
14516 if (max_char >= kind_limit)
14517 break;
14518 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014519 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014520 }
Victor Stinner55c99112011-10-13 01:17:06 +020014521 else
14522 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014523 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014524 if (result == NULL)
14525 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014526 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014527 dest_data = PyUnicode_DATA(result);
14528
14529 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014530 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14531 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014532 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014533 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014534 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014535 } else {
14536 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14537 return NULL;
14538 }
14539}
14540
14541static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014542 (lenfunc)unicode_length, /* mp_length */
14543 (binaryfunc)unicode_subscript, /* mp_subscript */
14544 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014545};
14546
Guido van Rossumd57fd912000-03-10 22:53:23 +000014547
Guido van Rossumd57fd912000-03-10 22:53:23 +000014548/* Helpers for PyUnicode_Format() */
14549
Victor Stinnera47082312012-10-04 02:19:54 +020014550struct unicode_formatter_t {
14551 PyObject *args;
14552 int args_owned;
14553 Py_ssize_t arglen, argidx;
14554 PyObject *dict;
14555
14556 enum PyUnicode_Kind fmtkind;
14557 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014558 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014559 PyObject *fmtstr;
14560
14561 _PyUnicodeWriter writer;
14562};
14563
14564struct unicode_format_arg_t {
14565 Py_UCS4 ch;
14566 int flags;
14567 Py_ssize_t width;
14568 int prec;
14569 int sign;
14570};
14571
Guido van Rossumd57fd912000-03-10 22:53:23 +000014572static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014573unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014574{
Victor Stinnera47082312012-10-04 02:19:54 +020014575 Py_ssize_t argidx = ctx->argidx;
14576
14577 if (argidx < ctx->arglen) {
14578 ctx->argidx++;
14579 if (ctx->arglen < 0)
14580 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014581 else
Victor Stinnera47082312012-10-04 02:19:54 +020014582 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014583 }
14584 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014585 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014586 return NULL;
14587}
14588
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014589/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014590
Victor Stinnera47082312012-10-04 02:19:54 +020014591/* Format a float into the writer if the writer is not NULL, or into *p_output
14592 otherwise.
14593
14594 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014595static int
Victor Stinnera47082312012-10-04 02:19:54 +020014596formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14597 PyObject **p_output,
14598 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014599{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014600 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014601 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014602 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014603 int prec;
14604 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014605
Guido van Rossumd57fd912000-03-10 22:53:23 +000014606 x = PyFloat_AsDouble(v);
14607 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014608 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014609
Victor Stinnera47082312012-10-04 02:19:54 +020014610 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014611 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014612 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014613
Victor Stinnera47082312012-10-04 02:19:54 +020014614 if (arg->flags & F_ALT)
14615 dtoa_flags = Py_DTSF_ALT;
14616 else
14617 dtoa_flags = 0;
14618 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014619 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014620 return -1;
14621 len = strlen(p);
14622 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014623 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014624 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014625 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014626 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014627 }
14628 else
14629 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014630 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014631 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014632}
14633
Victor Stinnerd0880d52012-04-27 23:40:13 +020014634/* formatlong() emulates the format codes d, u, o, x and X, and
14635 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14636 * Python's regular ints.
14637 * Return value: a new PyUnicodeObject*, or NULL if error.
14638 * The output string is of the form
14639 * "-"? ("0x" | "0X")? digit+
14640 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14641 * set in flags. The case of hex digits will be correct,
14642 * There will be at least prec digits, zero-filled on the left if
14643 * necessary to get that many.
14644 * val object to be converted
14645 * flags bitmask of format flags; only F_ALT is looked at
14646 * prec minimum number of digits; 0-fill on left if needed
14647 * type a character in [duoxX]; u acts the same as d
14648 *
14649 * CAUTION: o, x and X conversions on regular ints can never
14650 * produce a '-' sign, but can for Python's unbounded ints.
14651 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014652PyObject *
14653_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014654{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014655 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014656 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014657 Py_ssize_t i;
14658 int sign; /* 1 if '-', else 0 */
14659 int len; /* number of characters */
14660 Py_ssize_t llen;
14661 int numdigits; /* len == numnondigits + numdigits */
14662 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014663
Victor Stinnerd0880d52012-04-27 23:40:13 +020014664 /* Avoid exceeding SSIZE_T_MAX */
14665 if (prec > INT_MAX-3) {
14666 PyErr_SetString(PyExc_OverflowError,
14667 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014668 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014669 }
14670
14671 assert(PyLong_Check(val));
14672
14673 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014674 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014675 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014676 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014677 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014678 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014679 /* int and int subclasses should print numerically when a numeric */
14680 /* format code is used (see issue18780) */
14681 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014682 break;
14683 case 'o':
14684 numnondigits = 2;
14685 result = PyNumber_ToBase(val, 8);
14686 break;
14687 case 'x':
14688 case 'X':
14689 numnondigits = 2;
14690 result = PyNumber_ToBase(val, 16);
14691 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014692 }
14693 if (!result)
14694 return NULL;
14695
14696 assert(unicode_modifiable(result));
14697 assert(PyUnicode_IS_READY(result));
14698 assert(PyUnicode_IS_ASCII(result));
14699
14700 /* To modify the string in-place, there can only be one reference. */
14701 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014702 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014703 PyErr_BadInternalCall();
14704 return NULL;
14705 }
14706 buf = PyUnicode_DATA(result);
14707 llen = PyUnicode_GET_LENGTH(result);
14708 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014709 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014710 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014711 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014712 return NULL;
14713 }
14714 len = (int)llen;
14715 sign = buf[0] == '-';
14716 numnondigits += sign;
14717 numdigits = len - numnondigits;
14718 assert(numdigits > 0);
14719
14720 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014721 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014722 (type == 'o' || type == 'x' || type == 'X'))) {
14723 assert(buf[sign] == '0');
14724 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14725 buf[sign+1] == 'o');
14726 numnondigits -= 2;
14727 buf += 2;
14728 len -= 2;
14729 if (sign)
14730 buf[0] = '-';
14731 assert(len == numnondigits + numdigits);
14732 assert(numdigits > 0);
14733 }
14734
14735 /* Fill with leading zeroes to meet minimum width. */
14736 if (prec > numdigits) {
14737 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14738 numnondigits + prec);
14739 char *b1;
14740 if (!r1) {
14741 Py_DECREF(result);
14742 return NULL;
14743 }
14744 b1 = PyBytes_AS_STRING(r1);
14745 for (i = 0; i < numnondigits; ++i)
14746 *b1++ = *buf++;
14747 for (i = 0; i < prec - numdigits; i++)
14748 *b1++ = '0';
14749 for (i = 0; i < numdigits; i++)
14750 *b1++ = *buf++;
14751 *b1 = '\0';
14752 Py_DECREF(result);
14753 result = r1;
14754 buf = PyBytes_AS_STRING(result);
14755 len = numnondigits + prec;
14756 }
14757
14758 /* Fix up case for hex conversions. */
14759 if (type == 'X') {
14760 /* Need to convert all lower case letters to upper case.
14761 and need to convert 0x to 0X (and -0x to -0X). */
14762 for (i = 0; i < len; i++)
14763 if (buf[i] >= 'a' && buf[i] <= 'x')
14764 buf[i] -= 'a'-'A';
14765 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014766 if (!PyUnicode_Check(result)
14767 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014768 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014769 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014770 Py_DECREF(result);
14771 result = unicode;
14772 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014773 else if (len != PyUnicode_GET_LENGTH(result)) {
14774 if (PyUnicode_Resize(&result, len) < 0)
14775 Py_CLEAR(result);
14776 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014777 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014778}
14779
Ethan Furmandf3ed242014-01-05 06:50:30 -080014780/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014781 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014782 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014783 * -1 and raise an exception on error */
14784static int
Victor Stinnera47082312012-10-04 02:19:54 +020014785mainformatlong(PyObject *v,
14786 struct unicode_format_arg_t *arg,
14787 PyObject **p_output,
14788 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014789{
14790 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014791 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014792
14793 if (!PyNumber_Check(v))
14794 goto wrongtype;
14795
Ethan Furman9ab74802014-03-21 06:38:46 -070014796 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014797 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014798 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014799 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014800 }
14801 else {
14802 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014803 }
14804 if (iobj == NULL ) {
14805 if (PyErr_ExceptionMatches(PyExc_TypeError))
14806 goto wrongtype;
14807 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014808 }
14809 assert(PyLong_Check(iobj));
14810 }
14811 else {
14812 iobj = v;
14813 Py_INCREF(iobj);
14814 }
14815
14816 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014817 && arg->width == -1 && arg->prec == -1
14818 && !(arg->flags & (F_SIGN | F_BLANK))
14819 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014820 {
14821 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014822 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014823 int base;
14824
Victor Stinnera47082312012-10-04 02:19:54 +020014825 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014826 {
14827 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014828 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014829 case 'd':
14830 case 'i':
14831 case 'u':
14832 base = 10;
14833 break;
14834 case 'o':
14835 base = 8;
14836 break;
14837 case 'x':
14838 case 'X':
14839 base = 16;
14840 break;
14841 }
14842
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014843 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14844 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014845 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014846 }
14847 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014848 return 1;
14849 }
14850
Ethan Furmanb95b5612015-01-23 20:05:18 -080014851 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014852 Py_DECREF(iobj);
14853 if (res == NULL)
14854 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014855 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014856 return 0;
14857
14858wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014859 switch(type)
14860 {
14861 case 'o':
14862 case 'x':
14863 case 'X':
14864 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014865 "%%%c format: an integer is required, "
14866 "not %.200s",
14867 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014868 break;
14869 default:
14870 PyErr_Format(PyExc_TypeError,
Serhiy Storchakae2ec0b22020-10-09 14:14:37 +030014871 "%%%c format: a real number is required, "
Victor Stinner998b8062018-09-12 00:23:25 +020014872 "not %.200s",
14873 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014874 break;
14875 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014876 return -1;
14877}
14878
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014879static Py_UCS4
14880formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014881{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014882 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014883 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014884 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014885 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014886 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014887 goto onError;
14888 }
14889 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014890 int overflow;
14891 long x = PyLong_AsLongAndOverflow(v, &overflow);
14892 if (x == -1 && PyErr_Occurred()) {
14893 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014894 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014895 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014896 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014897 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014898
Victor Stinner8faf8212011-12-08 22:14:11 +010014899 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014900 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014901 PyErr_SetString(PyExc_OverflowError,
14902 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014903 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014904 }
14905
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014906 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014907 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014908
Benjamin Peterson29060642009-01-31 22:14:21 +000014909 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014910 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014911 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014912 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014913}
14914
Victor Stinnera47082312012-10-04 02:19:54 +020014915/* Parse options of an argument: flags, width, precision.
14916 Handle also "%(name)" syntax.
14917
14918 Return 0 if the argument has been formatted into arg->str.
14919 Return 1 if the argument has been written into ctx->writer,
14920 Raise an exception and return -1 on error. */
14921static int
14922unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14923 struct unicode_format_arg_t *arg)
14924{
14925#define FORMAT_READ(ctx) \
14926 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14927
14928 PyObject *v;
14929
Victor Stinnera47082312012-10-04 02:19:54 +020014930 if (arg->ch == '(') {
14931 /* Get argument value from a dictionary. Example: "%(name)s". */
14932 Py_ssize_t keystart;
14933 Py_ssize_t keylen;
14934 PyObject *key;
14935 int pcount = 1;
14936
14937 if (ctx->dict == NULL) {
14938 PyErr_SetString(PyExc_TypeError,
14939 "format requires a mapping");
14940 return -1;
14941 }
14942 ++ctx->fmtpos;
14943 --ctx->fmtcnt;
14944 keystart = ctx->fmtpos;
14945 /* Skip over balanced parentheses */
14946 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14947 arg->ch = FORMAT_READ(ctx);
14948 if (arg->ch == ')')
14949 --pcount;
14950 else if (arg->ch == '(')
14951 ++pcount;
14952 ctx->fmtpos++;
14953 }
14954 keylen = ctx->fmtpos - keystart - 1;
14955 if (ctx->fmtcnt < 0 || pcount > 0) {
14956 PyErr_SetString(PyExc_ValueError,
14957 "incomplete format key");
14958 return -1;
14959 }
14960 key = PyUnicode_Substring(ctx->fmtstr,
14961 keystart, keystart + keylen);
14962 if (key == NULL)
14963 return -1;
14964 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014965 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014966 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014967 }
14968 ctx->args = PyObject_GetItem(ctx->dict, key);
14969 Py_DECREF(key);
14970 if (ctx->args == NULL)
14971 return -1;
14972 ctx->args_owned = 1;
14973 ctx->arglen = -1;
14974 ctx->argidx = -2;
14975 }
14976
14977 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014978 while (--ctx->fmtcnt >= 0) {
14979 arg->ch = FORMAT_READ(ctx);
14980 ctx->fmtpos++;
14981 switch (arg->ch) {
14982 case '-': arg->flags |= F_LJUST; continue;
14983 case '+': arg->flags |= F_SIGN; continue;
14984 case ' ': arg->flags |= F_BLANK; continue;
14985 case '#': arg->flags |= F_ALT; continue;
14986 case '0': arg->flags |= F_ZERO; continue;
14987 }
14988 break;
14989 }
14990
14991 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014992 if (arg->ch == '*') {
14993 v = unicode_format_getnextarg(ctx);
14994 if (v == NULL)
14995 return -1;
14996 if (!PyLong_Check(v)) {
14997 PyErr_SetString(PyExc_TypeError,
14998 "* wants int");
14999 return -1;
15000 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015001 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015002 if (arg->width == -1 && PyErr_Occurred())
15003 return -1;
15004 if (arg->width < 0) {
15005 arg->flags |= F_LJUST;
15006 arg->width = -arg->width;
15007 }
15008 if (--ctx->fmtcnt >= 0) {
15009 arg->ch = FORMAT_READ(ctx);
15010 ctx->fmtpos++;
15011 }
15012 }
15013 else if (arg->ch >= '0' && arg->ch <= '9') {
15014 arg->width = arg->ch - '0';
15015 while (--ctx->fmtcnt >= 0) {
15016 arg->ch = FORMAT_READ(ctx);
15017 ctx->fmtpos++;
15018 if (arg->ch < '0' || arg->ch > '9')
15019 break;
15020 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
15021 mixing signed and unsigned comparison. Since arg->ch is between
15022 '0' and '9', casting to int is safe. */
15023 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
15024 PyErr_SetString(PyExc_ValueError,
15025 "width too big");
15026 return -1;
15027 }
15028 arg->width = arg->width*10 + (arg->ch - '0');
15029 }
15030 }
15031
15032 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020015033 if (arg->ch == '.') {
15034 arg->prec = 0;
15035 if (--ctx->fmtcnt >= 0) {
15036 arg->ch = FORMAT_READ(ctx);
15037 ctx->fmtpos++;
15038 }
15039 if (arg->ch == '*') {
15040 v = unicode_format_getnextarg(ctx);
15041 if (v == NULL)
15042 return -1;
15043 if (!PyLong_Check(v)) {
15044 PyErr_SetString(PyExc_TypeError,
15045 "* wants int");
15046 return -1;
15047 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015048 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015049 if (arg->prec == -1 && PyErr_Occurred())
15050 return -1;
15051 if (arg->prec < 0)
15052 arg->prec = 0;
15053 if (--ctx->fmtcnt >= 0) {
15054 arg->ch = FORMAT_READ(ctx);
15055 ctx->fmtpos++;
15056 }
15057 }
15058 else if (arg->ch >= '0' && arg->ch <= '9') {
15059 arg->prec = arg->ch - '0';
15060 while (--ctx->fmtcnt >= 0) {
15061 arg->ch = FORMAT_READ(ctx);
15062 ctx->fmtpos++;
15063 if (arg->ch < '0' || arg->ch > '9')
15064 break;
15065 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15066 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020015067 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020015068 return -1;
15069 }
15070 arg->prec = arg->prec*10 + (arg->ch - '0');
15071 }
15072 }
15073 }
15074
15075 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15076 if (ctx->fmtcnt >= 0) {
15077 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15078 if (--ctx->fmtcnt >= 0) {
15079 arg->ch = FORMAT_READ(ctx);
15080 ctx->fmtpos++;
15081 }
15082 }
15083 }
15084 if (ctx->fmtcnt < 0) {
15085 PyErr_SetString(PyExc_ValueError,
15086 "incomplete format");
15087 return -1;
15088 }
15089 return 0;
15090
15091#undef FORMAT_READ
15092}
15093
15094/* Format one argument. Supported conversion specifiers:
15095
15096 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080015097 - "i", "d", "u": int or float
15098 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020015099 - "e", "E", "f", "F", "g", "G": float
15100 - "c": int or str (1 character)
15101
Victor Stinner8dbd4212012-12-04 09:30:24 +010015102 When possible, the output is written directly into the Unicode writer
15103 (ctx->writer). A string is created when padding is required.
15104
Victor Stinnera47082312012-10-04 02:19:54 +020015105 Return 0 if the argument has been formatted into *p_str,
15106 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010015107 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020015108static int
15109unicode_format_arg_format(struct unicode_formatter_t *ctx,
15110 struct unicode_format_arg_t *arg,
15111 PyObject **p_str)
15112{
15113 PyObject *v;
15114 _PyUnicodeWriter *writer = &ctx->writer;
15115
15116 if (ctx->fmtcnt == 0)
15117 ctx->writer.overallocate = 0;
15118
Victor Stinnera47082312012-10-04 02:19:54 +020015119 v = unicode_format_getnextarg(ctx);
15120 if (v == NULL)
15121 return -1;
15122
Victor Stinnera47082312012-10-04 02:19:54 +020015123
15124 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020015125 case 's':
15126 case 'r':
15127 case 'a':
15128 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15129 /* Fast path */
15130 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15131 return -1;
15132 return 1;
15133 }
15134
15135 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15136 *p_str = v;
15137 Py_INCREF(*p_str);
15138 }
15139 else {
15140 if (arg->ch == 's')
15141 *p_str = PyObject_Str(v);
15142 else if (arg->ch == 'r')
15143 *p_str = PyObject_Repr(v);
15144 else
15145 *p_str = PyObject_ASCII(v);
15146 }
15147 break;
15148
15149 case 'i':
15150 case 'd':
15151 case 'u':
15152 case 'o':
15153 case 'x':
15154 case 'X':
15155 {
15156 int ret = mainformatlong(v, arg, p_str, writer);
15157 if (ret != 0)
15158 return ret;
15159 arg->sign = 1;
15160 break;
15161 }
15162
15163 case 'e':
15164 case 'E':
15165 case 'f':
15166 case 'F':
15167 case 'g':
15168 case 'G':
15169 if (arg->width == -1 && arg->prec == -1
15170 && !(arg->flags & (F_SIGN | F_BLANK)))
15171 {
15172 /* Fast path */
15173 if (formatfloat(v, arg, NULL, writer) == -1)
15174 return -1;
15175 return 1;
15176 }
15177
15178 arg->sign = 1;
15179 if (formatfloat(v, arg, p_str, NULL) == -1)
15180 return -1;
15181 break;
15182
15183 case 'c':
15184 {
15185 Py_UCS4 ch = formatchar(v);
15186 if (ch == (Py_UCS4) -1)
15187 return -1;
15188 if (arg->width == -1 && arg->prec == -1) {
15189 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015190 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015191 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015192 return 1;
15193 }
15194 *p_str = PyUnicode_FromOrdinal(ch);
15195 break;
15196 }
15197
15198 default:
15199 PyErr_Format(PyExc_ValueError,
15200 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015201 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015202 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15203 (int)arg->ch,
15204 ctx->fmtpos - 1);
15205 return -1;
15206 }
15207 if (*p_str == NULL)
15208 return -1;
15209 assert (PyUnicode_Check(*p_str));
15210 return 0;
15211}
15212
15213static int
15214unicode_format_arg_output(struct unicode_formatter_t *ctx,
15215 struct unicode_format_arg_t *arg,
15216 PyObject *str)
15217{
15218 Py_ssize_t len;
15219 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015220 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015221 Py_ssize_t pindex;
15222 Py_UCS4 signchar;
15223 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015224 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015225 Py_ssize_t sublen;
15226 _PyUnicodeWriter *writer = &ctx->writer;
15227 Py_UCS4 fill;
15228
15229 fill = ' ';
15230 if (arg->sign && arg->flags & F_ZERO)
15231 fill = '0';
15232
15233 if (PyUnicode_READY(str) == -1)
15234 return -1;
15235
15236 len = PyUnicode_GET_LENGTH(str);
15237 if ((arg->width == -1 || arg->width <= len)
15238 && (arg->prec == -1 || arg->prec >= len)
15239 && !(arg->flags & (F_SIGN | F_BLANK)))
15240 {
15241 /* Fast path */
15242 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15243 return -1;
15244 return 0;
15245 }
15246
15247 /* Truncate the string for "s", "r" and "a" formats
15248 if the precision is set */
15249 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15250 if (arg->prec >= 0 && len > arg->prec)
15251 len = arg->prec;
15252 }
15253
15254 /* Adjust sign and width */
15255 kind = PyUnicode_KIND(str);
15256 pbuf = PyUnicode_DATA(str);
15257 pindex = 0;
15258 signchar = '\0';
15259 if (arg->sign) {
15260 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15261 if (ch == '-' || ch == '+') {
15262 signchar = ch;
15263 len--;
15264 pindex++;
15265 }
15266 else if (arg->flags & F_SIGN)
15267 signchar = '+';
15268 else if (arg->flags & F_BLANK)
15269 signchar = ' ';
15270 else
15271 arg->sign = 0;
15272 }
15273 if (arg->width < len)
15274 arg->width = len;
15275
15276 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015277 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015278 if (!(arg->flags & F_LJUST)) {
15279 if (arg->sign) {
15280 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015281 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015282 }
15283 else {
15284 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015285 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015286 }
15287 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015288 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15289 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015290 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015291 }
15292
Victor Stinnera47082312012-10-04 02:19:54 +020015293 buflen = arg->width;
15294 if (arg->sign && len == arg->width)
15295 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015296 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015297 return -1;
15298
15299 /* Write the sign if needed */
15300 if (arg->sign) {
15301 if (fill != ' ') {
15302 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15303 writer->pos += 1;
15304 }
15305 if (arg->width > len)
15306 arg->width--;
15307 }
15308
15309 /* Write the numeric prefix for "x", "X" and "o" formats
15310 if the alternate form is used.
15311 For example, write "0x" for the "%#x" format. */
15312 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15313 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15314 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15315 if (fill != ' ') {
15316 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15317 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15318 writer->pos += 2;
15319 pindex += 2;
15320 }
15321 arg->width -= 2;
15322 if (arg->width < 0)
15323 arg->width = 0;
15324 len -= 2;
15325 }
15326
15327 /* Pad left with the fill character if needed */
15328 if (arg->width > len && !(arg->flags & F_LJUST)) {
15329 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015330 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015331 writer->pos += sublen;
15332 arg->width = len;
15333 }
15334
15335 /* If padding with spaces: write sign if needed and/or numeric prefix if
15336 the alternate form is used */
15337 if (fill == ' ') {
15338 if (arg->sign) {
15339 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15340 writer->pos += 1;
15341 }
15342 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15343 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15344 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15345 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15346 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15347 writer->pos += 2;
15348 pindex += 2;
15349 }
15350 }
15351
15352 /* Write characters */
15353 if (len) {
15354 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15355 str, pindex, len);
15356 writer->pos += len;
15357 }
15358
15359 /* Pad right with the fill character if needed */
15360 if (arg->width > len) {
15361 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015362 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015363 writer->pos += sublen;
15364 }
15365 return 0;
15366}
15367
15368/* Helper of PyUnicode_Format(): format one arg.
15369 Return 0 on success, raise an exception and return -1 on error. */
15370static int
15371unicode_format_arg(struct unicode_formatter_t *ctx)
15372{
15373 struct unicode_format_arg_t arg;
15374 PyObject *str;
15375 int ret;
15376
Victor Stinner8dbd4212012-12-04 09:30:24 +010015377 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015378 if (arg.ch == '%') {
15379 ctx->fmtpos++;
15380 ctx->fmtcnt--;
15381 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15382 return -1;
15383 return 0;
15384 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015385 arg.flags = 0;
15386 arg.width = -1;
15387 arg.prec = -1;
15388 arg.sign = 0;
15389 str = NULL;
15390
Victor Stinnera47082312012-10-04 02:19:54 +020015391 ret = unicode_format_arg_parse(ctx, &arg);
15392 if (ret == -1)
15393 return -1;
15394
15395 ret = unicode_format_arg_format(ctx, &arg, &str);
15396 if (ret == -1)
15397 return -1;
15398
15399 if (ret != 1) {
15400 ret = unicode_format_arg_output(ctx, &arg, str);
15401 Py_DECREF(str);
15402 if (ret == -1)
15403 return -1;
15404 }
15405
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015406 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015407 PyErr_SetString(PyExc_TypeError,
15408 "not all arguments converted during string formatting");
15409 return -1;
15410 }
15411 return 0;
15412}
15413
Alexander Belopolsky40018472011-02-26 01:02:56 +000015414PyObject *
15415PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015416{
Victor Stinnera47082312012-10-04 02:19:54 +020015417 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015418
Guido van Rossumd57fd912000-03-10 22:53:23 +000015419 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015420 PyErr_BadInternalCall();
15421 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015422 }
Victor Stinnera47082312012-10-04 02:19:54 +020015423
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015424 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015425 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015426
15427 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015428 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15429 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15430 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15431 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015432
Victor Stinner8f674cc2013-04-17 23:02:17 +020015433 _PyUnicodeWriter_Init(&ctx.writer);
15434 ctx.writer.min_length = ctx.fmtcnt + 100;
15435 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015436
Guido van Rossumd57fd912000-03-10 22:53:23 +000015437 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015438 ctx.arglen = PyTuple_Size(args);
15439 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015440 }
15441 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015442 ctx.arglen = -1;
15443 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015444 }
Victor Stinnera47082312012-10-04 02:19:54 +020015445 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015446 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015447 ctx.dict = args;
15448 else
15449 ctx.dict = NULL;
15450 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015451
Victor Stinnera47082312012-10-04 02:19:54 +020015452 while (--ctx.fmtcnt >= 0) {
15453 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015454 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015455
15456 nonfmtpos = ctx.fmtpos++;
15457 while (ctx.fmtcnt >= 0 &&
15458 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15459 ctx.fmtpos++;
15460 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015461 }
Victor Stinnera47082312012-10-04 02:19:54 +020015462 if (ctx.fmtcnt < 0) {
15463 ctx.fmtpos--;
15464 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015465 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015466
Victor Stinnercfc4c132013-04-03 01:48:39 +020015467 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15468 nonfmtpos, ctx.fmtpos) < 0)
15469 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015470 }
15471 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015472 ctx.fmtpos++;
15473 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015474 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015475 }
15476 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015477
Victor Stinnera47082312012-10-04 02:19:54 +020015478 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015479 PyErr_SetString(PyExc_TypeError,
15480 "not all arguments converted during string formatting");
15481 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015482 }
15483
Victor Stinnera47082312012-10-04 02:19:54 +020015484 if (ctx.args_owned) {
15485 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015486 }
Victor Stinnera47082312012-10-04 02:19:54 +020015487 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015488
Benjamin Peterson29060642009-01-31 22:14:21 +000015489 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015490 _PyUnicodeWriter_Dealloc(&ctx.writer);
15491 if (ctx.args_owned) {
15492 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015493 }
15494 return NULL;
15495}
15496
Jeremy Hylton938ace62002-07-17 16:30:39 +000015497static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015498unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15499
15500/*[clinic input]
15501@classmethod
15502str.__new__ as unicode_new
15503
15504 object as x: object = NULL
15505 encoding: str = NULL
15506 errors: str = NULL
15507
15508[clinic start generated code]*/
Guido van Rossume023fe02001-08-30 03:12:59 +000015509
Tim Peters6d6c1a32001-08-02 04:15:00 +000015510static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015511unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15512 const char *errors)
15513/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
Tim Peters6d6c1a32001-08-02 04:15:00 +000015514{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015515 PyObject *unicode;
15516 if (x == NULL) {
15517 unicode = unicode_new_empty();
15518 }
15519 else if (encoding == NULL && errors == NULL) {
15520 unicode = PyObject_Str(x);
15521 }
15522 else {
15523 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15524 }
Tim Peters6d6c1a32001-08-02 04:15:00 +000015525
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015526 if (unicode != NULL && type != &PyUnicode_Type) {
15527 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15528 }
15529 return unicode;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015530}
15531
Guido van Rossume023fe02001-08-30 03:12:59 +000015532static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015533unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
Guido van Rossume023fe02001-08-30 03:12:59 +000015534{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015535 PyObject *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015536 Py_ssize_t length, char_size;
15537 int share_wstr, share_utf8;
15538 unsigned int kind;
15539 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015540
Benjamin Peterson14339b62009-01-31 16:36:08 +000015541 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner910337b2011-10-03 03:20:16 +020015542 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015543 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015544 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015545 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015546
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015547 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015548 if (self == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015549 return NULL;
15550 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015551 kind = PyUnicode_KIND(unicode);
15552 length = PyUnicode_GET_LENGTH(unicode);
15553
15554 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015555#ifdef Py_DEBUG
15556 _PyUnicode_HASH(self) = -1;
15557#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015558 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015559#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015560 _PyUnicode_STATE(self).interned = 0;
15561 _PyUnicode_STATE(self).kind = kind;
15562 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015563 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015564 _PyUnicode_STATE(self).ready = 1;
15565 _PyUnicode_WSTR(self) = NULL;
15566 _PyUnicode_UTF8_LENGTH(self) = 0;
15567 _PyUnicode_UTF8(self) = NULL;
15568 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015569 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015570
15571 share_utf8 = 0;
15572 share_wstr = 0;
15573 if (kind == PyUnicode_1BYTE_KIND) {
15574 char_size = 1;
15575 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15576 share_utf8 = 1;
15577 }
15578 else if (kind == PyUnicode_2BYTE_KIND) {
15579 char_size = 2;
15580 if (sizeof(wchar_t) == 2)
15581 share_wstr = 1;
15582 }
15583 else {
15584 assert(kind == PyUnicode_4BYTE_KIND);
15585 char_size = 4;
15586 if (sizeof(wchar_t) == 4)
15587 share_wstr = 1;
15588 }
15589
15590 /* Ensure we won't overflow the length. */
15591 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15592 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015593 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015594 }
Victor Stinner32bd68c2020-12-01 10:37:39 +010015595 data = PyObject_Malloc((length + 1) * char_size);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015596 if (data == NULL) {
15597 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015598 goto onError;
15599 }
15600
Victor Stinnerc3c74152011-10-02 20:39:55 +020015601 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015602 if (share_utf8) {
15603 _PyUnicode_UTF8_LENGTH(self) = length;
15604 _PyUnicode_UTF8(self) = data;
15605 }
15606 if (share_wstr) {
15607 _PyUnicode_WSTR_LENGTH(self) = length;
15608 _PyUnicode_WSTR(self) = (wchar_t *)data;
15609 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015610
Christian Heimesf051e432016-09-13 20:22:02 +020015611 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015612 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015613 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015614#ifdef Py_DEBUG
15615 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15616#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +010015617 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015618
15619onError:
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015620 Py_DECREF(self);
15621 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015622}
15623
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015624PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015625"str(object='') -> str\n\
15626str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015627\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015628Create a new string object from the given object. If encoding or\n\
15629errors is specified, then the object must expose a data buffer\n\
15630that will be decoded using the given encoding and error handler.\n\
15631Otherwise, returns the result of object.__str__() (if defined)\n\
15632or repr(object).\n\
15633encoding defaults to sys.getdefaultencoding().\n\
15634errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015635
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015636static PyObject *unicode_iter(PyObject *seq);
15637
Guido van Rossumd57fd912000-03-10 22:53:23 +000015638PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015639 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015640 "str", /* tp_name */
15641 sizeof(PyUnicodeObject), /* tp_basicsize */
15642 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015643 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015644 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015645 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015646 0, /* tp_getattr */
15647 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015648 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015649 unicode_repr, /* tp_repr */
15650 &unicode_as_number, /* tp_as_number */
15651 &unicode_as_sequence, /* tp_as_sequence */
15652 &unicode_as_mapping, /* tp_as_mapping */
15653 (hashfunc) unicode_hash, /* tp_hash*/
15654 0, /* tp_call*/
15655 (reprfunc) unicode_str, /* tp_str */
15656 PyObject_GenericGetAttr, /* tp_getattro */
15657 0, /* tp_setattro */
15658 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015659 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015660 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15661 unicode_doc, /* tp_doc */
15662 0, /* tp_traverse */
15663 0, /* tp_clear */
15664 PyUnicode_RichCompare, /* tp_richcompare */
15665 0, /* tp_weaklistoffset */
15666 unicode_iter, /* tp_iter */
15667 0, /* tp_iternext */
15668 unicode_methods, /* tp_methods */
15669 0, /* tp_members */
15670 0, /* tp_getset */
15671 &PyBaseObject_Type, /* tp_base */
15672 0, /* tp_dict */
15673 0, /* tp_descr_get */
15674 0, /* tp_descr_set */
15675 0, /* tp_dictoffset */
15676 0, /* tp_init */
15677 0, /* tp_alloc */
15678 unicode_new, /* tp_new */
15679 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015680};
15681
15682/* Initialize the Unicode implementation */
15683
Victor Stinner331a6a52019-05-27 16:39:22 +020015684PyStatus
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015685_PyUnicode_Init(PyThreadState *tstate)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015686{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015687 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015688 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015689 0x000A, /* LINE FEED */
15690 0x000D, /* CARRIAGE RETURN */
15691 0x001C, /* FILE SEPARATOR */
15692 0x001D, /* GROUP SEPARATOR */
15693 0x001E, /* RECORD SEPARATOR */
15694 0x0085, /* NEXT LINE */
15695 0x2028, /* LINE SEPARATOR */
15696 0x2029, /* PARAGRAPH SEPARATOR */
15697 };
15698
Victor Stinner91698d82020-06-25 14:07:40 +020015699 struct _Py_unicode_state *state = &tstate->interp->unicode;
15700 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015701 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015702 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015703
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015704 if (_Py_IsMainInterpreter(tstate)) {
15705 /* initialize the linebreak bloom filter */
15706 bloom_linebreak = make_bloom_mask(
15707 PyUnicode_2BYTE_KIND, linebreak,
15708 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015709
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015710 if (PyType_Ready(&PyUnicode_Type) < 0) {
15711 return _PyStatus_ERR("Can't initialize unicode type");
15712 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015713
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015714 if (PyType_Ready(&EncodingMapType) < 0) {
15715 return _PyStatus_ERR("Can't initialize encoding map type");
15716 }
15717 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15718 return _PyStatus_ERR("Can't initialize field name iterator type");
15719 }
15720 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15721 return _PyStatus_ERR("Can't initialize formatter iter type");
15722 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015723 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015724 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015725}
15726
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015727
Walter Dörwald16807132007-05-25 13:52:07 +000015728void
15729PyUnicode_InternInPlace(PyObject **p)
15730{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015731 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015732#ifdef Py_DEBUG
15733 assert(s != NULL);
15734 assert(_PyUnicode_CHECK(s));
15735#else
Victor Stinner607b1022020-05-05 18:50:30 +020015736 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015737 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015738 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015739#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015740
Benjamin Peterson14339b62009-01-31 16:36:08 +000015741 /* If it's a subclass, we don't really know what putting
15742 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015743 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015744 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015745 }
15746
15747 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015748 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015749 }
15750
Victor Stinner666ecfb2020-07-02 01:19:57 +020015751 if (PyUnicode_READY(s) == -1) {
15752 PyErr_Clear();
15753 return;
15754 }
15755
Victor Stinnerea251802020-12-26 02:58:33 +010015756 struct _Py_unicode_state *state = get_unicode_state();
15757 if (state->interned == NULL) {
15758 state->interned = PyDict_New();
15759 if (state->interned == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015760 PyErr_Clear(); /* Don't leave an exception */
15761 return;
15762 }
15763 }
Victor Stinner607b1022020-05-05 18:50:30 +020015764
Victor Stinnerea251802020-12-26 02:58:33 +010015765 PyObject *t = PyDict_SetDefault(state->interned, s, s);
Berker Peksagced8d4c2016-07-25 04:40:39 +030015766 if (t == NULL) {
15767 PyErr_Clear();
15768 return;
15769 }
Victor Stinner607b1022020-05-05 18:50:30 +020015770
Berker Peksagced8d4c2016-07-25 04:40:39 +030015771 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015772 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015773 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015774 return;
15775 }
Victor Stinner607b1022020-05-05 18:50:30 +020015776
Victor Stinner3549ca32020-07-03 16:59:12 +020015777 /* The two references in interned dict (key and value) are not counted by
15778 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15779 this. */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015780 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015781 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015782}
15783
Victor Stinnerea251802020-12-26 02:58:33 +010015784
Walter Dörwald16807132007-05-25 13:52:07 +000015785void
15786PyUnicode_InternImmortal(PyObject **p)
15787{
Victor Stinner583ee5a2020-10-02 14:49:00 +020015788 if (PyErr_WarnEx(PyExc_DeprecationWarning,
15789 "PyUnicode_InternImmortal() is deprecated; "
15790 "use PyUnicode_InternInPlace() instead", 1) < 0)
15791 {
15792 // The function has no return value, the exception cannot
15793 // be reported to the caller, so just log it.
15794 PyErr_WriteUnraisable(NULL);
15795 }
15796
Benjamin Peterson14339b62009-01-31 16:36:08 +000015797 PyUnicode_InternInPlace(p);
15798 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015799 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015800 Py_INCREF(*p);
15801 }
Walter Dörwald16807132007-05-25 13:52:07 +000015802}
15803
15804PyObject *
15805PyUnicode_InternFromString(const char *cp)
15806{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015807 PyObject *s = PyUnicode_FromString(cp);
15808 if (s == NULL)
15809 return NULL;
15810 PyUnicode_InternInPlace(&s);
15811 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015812}
15813
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015814
Victor Stinner666ecfb2020-07-02 01:19:57 +020015815void
15816_PyUnicode_ClearInterned(PyThreadState *tstate)
Walter Dörwald16807132007-05-25 13:52:07 +000015817{
Victor Stinnerea251802020-12-26 02:58:33 +010015818 struct _Py_unicode_state *state = &tstate->interp->unicode;
15819 if (state->interned == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015820 return;
15821 }
Victor Stinnerea251802020-12-26 02:58:33 +010015822 assert(PyDict_CheckExact(state->interned));
Victor Stinner666ecfb2020-07-02 01:19:57 +020015823
15824 /* Interned unicode strings are not forcibly deallocated; rather, we give
15825 them their stolen references back, and then clear and DECREF the
15826 interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015827
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015828#ifdef INTERNED_STATS
Victor Stinnerea251802020-12-26 02:58:33 +010015829 fprintf(stderr, "releasing %zd interned strings\n",
15830 PyDict_GET_SIZE(state->interned));
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015831
15832 Py_ssize_t immortal_size = 0, mortal_size = 0;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015833#endif
Victor Stinnerea251802020-12-26 02:58:33 +010015834 Py_ssize_t pos = 0;
15835 PyObject *s, *ignored_value;
15836 while (PyDict_Next(state->interned, &pos, &s, &ignored_value)) {
Victor Stinner666ecfb2020-07-02 01:19:57 +020015837 assert(PyUnicode_IS_READY(s));
15838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015839 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015840 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015841 Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015842#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015843 immortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015844#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015845 break;
15846 case SSTATE_INTERNED_MORTAL:
Victor Stinner3549ca32020-07-03 16:59:12 +020015847 // Restore the two references (key and value) ignored
15848 // by PyUnicode_InternInPlace().
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015849 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015850#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015851 mortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015852#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015853 break;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015854 case SSTATE_NOT_INTERNED:
15855 /* fall through */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015856 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015857 Py_UNREACHABLE();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015858 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015859 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015860 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015861#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015862 fprintf(stderr,
15863 "total size of all interned strings: %zd/%zd mortal/immortal\n",
15864 mortal_size, immortal_size);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015865#endif
Victor Stinner666ecfb2020-07-02 01:19:57 +020015866
Victor Stinnerea251802020-12-26 02:58:33 +010015867 PyDict_Clear(state->interned);
15868 Py_CLEAR(state->interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015869}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015870
15871
15872/********************* Unicode Iterator **************************/
15873
15874typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015875 PyObject_HEAD
15876 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015877 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015878} unicodeiterobject;
15879
15880static void
15881unicodeiter_dealloc(unicodeiterobject *it)
15882{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015883 _PyObject_GC_UNTRACK(it);
15884 Py_XDECREF(it->it_seq);
15885 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015886}
15887
15888static int
15889unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15890{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015891 Py_VISIT(it->it_seq);
15892 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015893}
15894
15895static PyObject *
15896unicodeiter_next(unicodeiterobject *it)
15897{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015898 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015899
Benjamin Peterson14339b62009-01-31 16:36:08 +000015900 assert(it != NULL);
15901 seq = it->it_seq;
15902 if (seq == NULL)
15903 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015904 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015906 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15907 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015908 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015909 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15910 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015911 if (item != NULL)
15912 ++it->it_index;
15913 return item;
15914 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015915
Benjamin Peterson14339b62009-01-31 16:36:08 +000015916 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015917 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015918 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015919}
15920
15921static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015922unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015923{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015924 Py_ssize_t len = 0;
15925 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015926 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015927 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015928}
15929
15930PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15931
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015932static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015933unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015934{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015935 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015936 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015937 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015938 it->it_seq, it->it_index);
15939 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015940 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015941 if (u == NULL)
15942 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015943 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015944 }
15945}
15946
15947PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15948
15949static PyObject *
15950unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15951{
15952 Py_ssize_t index = PyLong_AsSsize_t(state);
15953 if (index == -1 && PyErr_Occurred())
15954 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015955 if (it->it_seq != NULL) {
15956 if (index < 0)
15957 index = 0;
15958 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15959 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15960 it->it_index = index;
15961 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015962 Py_RETURN_NONE;
15963}
15964
15965PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15966
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015967static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015968 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015969 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015970 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15971 reduce_doc},
15972 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15973 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015974 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015975};
15976
15977PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015978 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15979 "str_iterator", /* tp_name */
15980 sizeof(unicodeiterobject), /* tp_basicsize */
15981 0, /* tp_itemsize */
15982 /* methods */
15983 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015984 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015985 0, /* tp_getattr */
15986 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015987 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015988 0, /* tp_repr */
15989 0, /* tp_as_number */
15990 0, /* tp_as_sequence */
15991 0, /* tp_as_mapping */
15992 0, /* tp_hash */
15993 0, /* tp_call */
15994 0, /* tp_str */
15995 PyObject_GenericGetAttr, /* tp_getattro */
15996 0, /* tp_setattro */
15997 0, /* tp_as_buffer */
15998 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15999 0, /* tp_doc */
16000 (traverseproc)unicodeiter_traverse, /* tp_traverse */
16001 0, /* tp_clear */
16002 0, /* tp_richcompare */
16003 0, /* tp_weaklistoffset */
16004 PyObject_SelfIter, /* tp_iter */
16005 (iternextfunc)unicodeiter_next, /* tp_iternext */
16006 unicodeiter_methods, /* tp_methods */
16007 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016008};
16009
16010static PyObject *
16011unicode_iter(PyObject *seq)
16012{
Benjamin Peterson14339b62009-01-31 16:36:08 +000016013 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016014
Benjamin Peterson14339b62009-01-31 16:36:08 +000016015 if (!PyUnicode_Check(seq)) {
16016 PyErr_BadInternalCall();
16017 return NULL;
16018 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020016019 if (PyUnicode_READY(seq) == -1)
16020 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016021 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
16022 if (it == NULL)
16023 return NULL;
16024 it->it_index = 0;
16025 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020016026 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016027 _PyObject_GC_TRACK(it);
16028 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016029}
16030
Victor Stinner709d23d2019-05-02 14:56:30 -040016031static int
16032encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016033{
Victor Stinner709d23d2019-05-02 14:56:30 -040016034 int res;
16035 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16036 if (res == -2) {
16037 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16038 return -1;
16039 }
16040 if (res < 0) {
16041 PyErr_NoMemory();
16042 return -1;
16043 }
16044 return 0;
16045}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016046
Victor Stinner709d23d2019-05-02 14:56:30 -040016047
16048static int
16049config_get_codec_name(wchar_t **config_encoding)
16050{
16051 char *encoding;
16052 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16053 return -1;
16054 }
16055
16056 PyObject *name_obj = NULL;
16057 PyObject *codec = _PyCodec_Lookup(encoding);
16058 PyMem_RawFree(encoding);
16059
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016060 if (!codec)
16061 goto error;
16062
16063 name_obj = PyObject_GetAttrString(codec, "name");
16064 Py_CLEAR(codec);
16065 if (!name_obj) {
16066 goto error;
16067 }
16068
Victor Stinner709d23d2019-05-02 14:56:30 -040016069 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16070 Py_DECREF(name_obj);
16071 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016072 goto error;
16073 }
16074
Victor Stinner709d23d2019-05-02 14:56:30 -040016075 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16076 if (raw_wname == NULL) {
16077 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016078 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016079 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016080 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016081
16082 PyMem_RawFree(*config_encoding);
16083 *config_encoding = raw_wname;
16084
16085 PyMem_Free(wname);
16086 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016087
16088error:
16089 Py_XDECREF(codec);
16090 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016091 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016092}
16093
16094
Victor Stinner331a6a52019-05-27 16:39:22 +020016095static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016096init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016097{
Victor Stinner709d23d2019-05-02 14:56:30 -040016098 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016099 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016100 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016101 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016102 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016103 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016104 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016105}
16106
16107
Victor Stinner709d23d2019-05-02 14:56:30 -040016108static int
16109init_fs_codec(PyInterpreterState *interp)
16110{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016111 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016112
16113 _Py_error_handler error_handler;
16114 error_handler = get_error_handler_wide(config->filesystem_errors);
16115 if (error_handler == _Py_ERROR_UNKNOWN) {
16116 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16117 return -1;
16118 }
16119
16120 char *encoding, *errors;
16121 if (encode_wstr_utf8(config->filesystem_encoding,
16122 &encoding,
16123 "filesystem_encoding") < 0) {
16124 return -1;
16125 }
16126
16127 if (encode_wstr_utf8(config->filesystem_errors,
16128 &errors,
16129 "filesystem_errors") < 0) {
16130 PyMem_RawFree(encoding);
16131 return -1;
16132 }
16133
Victor Stinner3d17c042020-05-14 01:48:38 +020016134 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16135 PyMem_RawFree(fs_codec->encoding);
16136 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016137 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016138 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16139 PyMem_RawFree(fs_codec->errors);
16140 fs_codec->errors = errors;
16141 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016142
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016143#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016144 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016145#endif
16146
Victor Stinner709d23d2019-05-02 14:56:30 -040016147 /* At this point, PyUnicode_EncodeFSDefault() and
16148 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16149 the C implementation of the filesystem encoding. */
16150
16151 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16152 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016153 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16154 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016155 PyErr_NoMemory();
16156 return -1;
16157 }
16158 return 0;
16159}
16160
16161
Victor Stinner331a6a52019-05-27 16:39:22 +020016162static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016163init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016164{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016165 PyInterpreterState *interp = tstate->interp;
16166
Victor Stinner709d23d2019-05-02 14:56:30 -040016167 /* Update the filesystem encoding to the normalized Python codec name.
16168 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16169 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016170 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016171 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016172 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016173 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016174 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016175 }
16176
Victor Stinner709d23d2019-05-02 14:56:30 -040016177 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016178 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016179 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016180 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016181}
16182
16183
Victor Stinner331a6a52019-05-27 16:39:22 +020016184PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016185_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016186{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016187 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016188 if (_PyStatus_EXCEPTION(status)) {
16189 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016190 }
16191
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016192 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016193}
16194
16195
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016196static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016197_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016198{
Victor Stinner3d17c042020-05-14 01:48:38 +020016199 PyMem_RawFree(fs_codec->encoding);
16200 fs_codec->encoding = NULL;
16201 fs_codec->utf8 = 0;
16202 PyMem_RawFree(fs_codec->errors);
16203 fs_codec->errors = NULL;
16204 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016205}
16206
16207
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set if
   the new configuration strings cannot be allocated (the configuration is
   left unchanged in that case). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *errors = _PyMem_RawWcsdup(L"replace");
    if (encoding != NULL && errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = errors;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed: release whichever one succeeded. */
    PyMem_RawFree(encoding);
    PyMem_RawFree(errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16233
16234
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016235void
Victor Stinner3d483342019-11-22 12:27:50 +010016236_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016237{
Victor Stinner666ecfb2020-07-02 01:19:57 +020016238 struct _Py_unicode_state *state = &tstate->interp->unicode;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016239
Victor Stinnerea251802020-12-26 02:58:33 +010016240 // _PyUnicode_ClearInterned() must be called before
16241 assert(state->interned == NULL);
16242
16243 _PyUnicode_FiniEncodings(&state->fs_codec);
16244
Victor Stinnerf4507232020-12-26 20:26:08 +010016245 unicode_clear_identifiers(state);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016246
Victor Stinner2f9ada92020-06-24 02:22:21 +020016247 for (Py_ssize_t i = 0; i < 256; i++) {
16248 Py_CLEAR(state->latin1[i]);
16249 }
Victor Stinnerea251802020-12-26 02:58:33 +010016250 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016251}
16252
16253
Georg Brandl66c221e2010-10-14 07:04:07 +000016254/* A _string module, to export formatter_parser and formatter_field_name_split
16255 to the string.Formatter class implemented in Python. */
16256
16257static PyMethodDef _string_methods[] = {
16258 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16259 METH_O, PyDoc_STR("split the argument as a field name")},
16260 {"formatter_parser", (PyCFunction) formatter_parser,
16261 METH_O, PyDoc_STR("parse the argument as a format string")},
16262 {NULL, NULL}
16263};
16264
16265static struct PyModuleDef _string_module = {
16266 PyModuleDef_HEAD_INIT,
Victor Stinnerbb083d32020-09-08 15:33:08 +020016267 .m_name = "_string",
16268 .m_doc = PyDoc_STR("string helper module"),
16269 .m_size = 0,
16270 .m_methods = _string_methods,
Georg Brandl66c221e2010-10-14 07:04:07 +000016271};
16272
16273PyMODINIT_FUNC
16274PyInit__string(void)
16275{
Victor Stinnerbb083d32020-09-08 15:33:08 +020016276 return PyModuleDef_Init(&_string_module);
Georg Brandl66c221e2010-10-14 07:04:07 +000016277}
16278
16279
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016280#ifdef __cplusplus
16281}
16282#endif