blob: a03ca9a10d152ff17f718f4687e99b9e79edef19 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner47e1afd2020-10-26 16:43:47 +010043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinnerba3d67c2020-12-26 00:41:46 +010044#include "pycore_atomic_funcs.h" // _Py_atomic_size_get()
Victor Stinner47e1afd2020-10-26 16:43:47 +010045#include "pycore_bytes_methods.h" // _Py_bytes_lower()
Serhiy Storchaka2ad93822020-12-03 12:46:16 +020046#include "pycore_format.h" // F_LJUST
Victor Stinner47e1afd2020-10-26 16:43:47 +010047#include "pycore_initconfig.h" // _PyStatus_OK()
48#include "pycore_interp.h" // PyInterpreterState.fs_codec
49#include "pycore_object.h" // _PyObject_GC_TRACK()
50#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
51#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
52#include "pycore_pystate.h" // _PyInterpreterState_GET()
53#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
54#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000056#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000057#include <windows.h>
58#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059
Victor Stinner666ecfb2020-07-02 01:19:57 +020060/* Uncomment to display statistics on interned strings at exit
61 in _PyUnicode_ClearInterned(). */
Victor Stinnerfecc4f22019-03-19 14:20:29 +010062/* #define INTERNED_STATS 1 */
63
64
Larry Hastings61272b72014-01-07 12:41:53 -080065/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090066class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080067[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090068/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
69
70/*[python input]
71class Py_UCS4_converter(CConverter):
72 type = 'Py_UCS4'
73 converter = 'convert_uc'
74
75 def converter_init(self):
76 if self.default is not unspecified:
77 self.c_default = ascii(self.default)
78 if len(self.c_default) > 4 or self.c_default[0] != "'":
79 self.c_default = hex(ord(self.default))
80
81[python start generated code]*/
82/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080083
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000084/* --- Globals ------------------------------------------------------------
85
Serhiy Storchaka05997252013-01-26 12:14:02 +020086NOTE: In the interpreter's initialization phase, some globals are currently
87 initialized dynamically as needed. In the process Unicode objects may
88 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000089
90*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000091
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000092
93#ifdef __cplusplus
94extern "C" {
95#endif
96
Victor Stinner8faf8212011-12-08 22:14:11 +010097/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
98#define MAX_UNICODE 0x10ffff
99
Victor Stinner910337b2011-10-03 03:20:16 +0200100#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200101# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +0200102#else
103# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
104#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200105
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200106#define _PyUnicode_UTF8(op) \
107 (((PyCompactUnicodeObject*)(op))->utf8)
108#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200109 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200110 assert(PyUnicode_IS_READY(op)), \
111 PyUnicode_IS_COMPACT_ASCII(op) ? \
112 ((char*)((PyASCIIObject*)(op) + 1)) : \
113 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200114#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200115 (((PyCompactUnicodeObject*)(op))->utf8_length)
116#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200117 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200118 assert(PyUnicode_IS_READY(op)), \
119 PyUnicode_IS_COMPACT_ASCII(op) ? \
120 ((PyASCIIObject*)(op))->length : \
121 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +0200122#define _PyUnicode_WSTR(op) \
123 (((PyASCIIObject*)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900124
/* Don't use deprecated macro of unicodeobject.h.

   Local replacement: evaluates to the `length` field when `op` is a
   compact ASCII string, and to the `wstr_length` field otherwise.
   The parameter is parenthesized inside each cast so that an argument
   which is itself an expression (e.g. pointer arithmetic) is cast as a
   whole rather than only its first operand. */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op)                       \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                   \
     ((PyASCIIObject*)(op))->length :                   \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200131#define _PyUnicode_WSTR_LENGTH(op) \
132 (((PyCompactUnicodeObject*)(op))->wstr_length)
133#define _PyUnicode_LENGTH(op) \
134 (((PyASCIIObject *)(op))->length)
135#define _PyUnicode_STATE(op) \
136 (((PyASCIIObject *)(op))->state)
137#define _PyUnicode_HASH(op) \
138 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200139#define _PyUnicode_KIND(op) \
140 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200141 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200142#define _PyUnicode_GET_LENGTH(op) \
143 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200144 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200145#define _PyUnicode_DATA_ANY(op) \
146 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200147
Victor Stinner910337b2011-10-03 03:20:16 +0200148#undef PyUnicode_READY
149#define PyUnicode_READY(op) \
150 (assert(_PyUnicode_CHECK(op)), \
151 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200152 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100153 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200154
Victor Stinnerc379ead2011-10-03 12:52:27 +0200155#define _PyUnicode_SHARE_UTF8(op) \
156 (assert(_PyUnicode_CHECK(op)), \
157 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
158 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same pointer as its character
   data (PyUnicode_DATA(op)).  The macro uses its own parameter throughout,
   so it no longer depends on the caller having a variable named
   `unicode` in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
162
Victor Stinner829c0ad2011-10-03 01:08:02 +0200163/* true if the Unicode object has an allocated UTF-8 memory block
164 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200165#define _PyUnicode_HAS_UTF8_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200166 ((!PyUnicode_IS_COMPACT_ASCII(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200167 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200168 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
169
Victor Stinner03490912011-10-03 23:45:12 +0200170/* true if the Unicode object has an allocated wstr memory block
171 (not shared with other data) */
172#define _PyUnicode_HAS_WSTR_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200173 ((_PyUnicode_WSTR(op) && \
Victor Stinner03490912011-10-03 23:45:12 +0200174 (!PyUnicode_IS_READY(op) || \
175 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
176
/* Generic helper macro to convert characters of different types.

   from_type and to_type must be valid type names.  begin and end point to
   the source characters (type "from_type *"); to points to the output
   buffer (type "to_type *").  Each source element in [begin, end) is cast
   to to_type and stored at the matching position of the output buffer.
   Every macro argument is evaluated exactly once.

   The bulk of the input is copied four elements per iteration; the
   remaining 0..3 elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_to = (to_type *)(to);                              \
        const from_type *_iter = (const from_type *)(begin);         \
        const from_type *_end = (const from_type *)(end);            \
        const Py_ssize_t _total = _end - _iter;                      \
        const from_type *_unrolled_end =                             \
            _iter + _Py_SIZE_ROUND_DOWN(_total, 4);                  \
        for (; _iter < _unrolled_end; _iter += 4, _to += 4) {        \
            _to[0] = (to_type) _iter[0];                             \
            _to[1] = (to_type) _iter[1];                             \
            _to[2] = (to_type) _iter[2];                             \
            _to[3] = (to_type) _iter[3];                             \
        }                                                            \
        for (; _iter < _end; _iter++, _to++) {                       \
            *_to = (to_type) *_iter;                                 \
        }                                                            \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200200
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200201#ifdef MS_WINDOWS
202 /* On Windows, overallocate by 50% is the best factor */
203# define OVERALLOCATE_FACTOR 2
204#else
205 /* On Linux, overallocate by 25% is the best factor */
206# define OVERALLOCATE_FACTOR 4
207#endif
208
Walter Dörwald16807132007-05-25 13:52:07 +0000209
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200210static struct _Py_unicode_state*
211get_unicode_state(void)
212{
213 PyInterpreterState *interp = _PyInterpreterState_GET();
214 return &interp->unicode;
215}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200216
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000217
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200218// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200219static inline PyObject* unicode_get_empty(void)
220{
221 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200222 // unicode_get_empty() must not be called before _PyUnicode_Init()
223 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200224 assert(state->empty_string != NULL);
225 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200226}
227
Victor Stinner91698d82020-06-25 14:07:40 +0200228
229// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200230static inline PyObject* unicode_new_empty(void)
231{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200232 PyObject *empty = unicode_get_empty();
233 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200234 return empty;
235}
236
237#define _Py_RETURN_UNICODE_EMPTY() \
238 do { \
239 return unicode_new_empty(); \
Serhiy Storchaka678db842013-01-26 12:16:36 +0200240 } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000241
Victor Stinner59423e32018-11-26 13:40:01 +0100242static inline void
243unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
244 Py_ssize_t start, Py_ssize_t length)
245{
246 assert(0 <= start);
247 assert(kind != PyUnicode_WCHAR_KIND);
248 switch (kind) {
249 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100250 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100251 Py_UCS1 ch = (unsigned char)value;
252 Py_UCS1 *to = (Py_UCS1 *)data + start;
253 memset(to, ch, length);
254 break;
255 }
256 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100257 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100258 Py_UCS2 ch = (Py_UCS2)value;
259 Py_UCS2 *to = (Py_UCS2 *)data + start;
260 const Py_UCS2 *end = to + length;
261 for (; to < end; ++to) *to = ch;
262 break;
263 }
264 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100265 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100266 Py_UCS4 ch = value;
267 Py_UCS4 * to = (Py_UCS4 *)data + start;
268 const Py_UCS4 *end = to + length;
269 for (; to < end; ++to) *to = ch;
270 break;
271 }
272 default: Py_UNREACHABLE();
273 }
274}
275
276
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200277/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700278static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200279_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900280static inline void
281_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400282static PyObject *
283unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
284 const char *errors);
285static PyObject *
286unicode_decode_utf8(const char *s, Py_ssize_t size,
287 _Py_error_handler error_handler, const char *errors,
288 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200289
/* Fast detection of the most frequent whitespace characters.

   Lookup table with one entry per ASCII code point (128 entries): an entry
   is 1 for the whitespace characters listed below and 0 for everything
   else.  Entries that are not named explicitly are zero-initialized. */
const unsigned char _Py_ascii_whitespace[] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
    [0x7F] = 0      /* highest index: fixes the table size at 128 */
};
320
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200321/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200322static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200323static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100324static int unicode_modifiable(PyObject *unicode);
325
Victor Stinnerfe226c02011-10-03 03:52:20 +0200326
Alexander Belopolsky40018472011-02-26 01:02:56 +0000327static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100328_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200329static PyObject *
330_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
331static PyObject *
332_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
333
334static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000335unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000336 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100337 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000338 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
339
Alexander Belopolsky40018472011-02-26 01:02:56 +0000340static void
341raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300342 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100343 PyObject *unicode,
344 Py_ssize_t startpos, Py_ssize_t endpos,
345 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000346
/* Fast detection of line break characters.

   Lookup table with one entry per ASCII code point (128 entries): an entry
   is 1 for the line break characters listed below and 0 for everything
   else.  Entries that are not named explicitly are zero-initialized. */
static const unsigned char ascii_linebreak[] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x7F] = 0      /* highest index: fixes the table size at 128 */
};
374
INADA Naoki3ae20562017-01-16 20:41:20 +0900375static int convert_uc(PyObject *obj, void *addr);
376
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300377#include "clinic/unicodeobject.c.h"
378
Victor Stinner3d4226a2018-08-29 22:21:32 +0200379_Py_error_handler
380_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200381{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200382 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200383 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200384 }
385 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200386 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200387 }
388 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200389 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200390 }
391 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200392 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200393 }
394 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200395 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200396 }
397 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200398 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 }
400 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200401 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200402 }
Victor Stinner50149202015-09-22 00:26:54 +0200403 return _Py_ERROR_OTHER;
404}
405
Victor Stinner709d23d2019-05-02 14:56:30 -0400406
407static _Py_error_handler
408get_error_handler_wide(const wchar_t *errors)
409{
410 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
411 return _Py_ERROR_STRICT;
412 }
413 if (wcscmp(errors, L"surrogateescape") == 0) {
414 return _Py_ERROR_SURROGATEESCAPE;
415 }
416 if (wcscmp(errors, L"replace") == 0) {
417 return _Py_ERROR_REPLACE;
418 }
419 if (wcscmp(errors, L"ignore") == 0) {
420 return _Py_ERROR_IGNORE;
421 }
422 if (wcscmp(errors, L"backslashreplace") == 0) {
423 return _Py_ERROR_BACKSLASHREPLACE;
424 }
425 if (wcscmp(errors, L"surrogatepass") == 0) {
426 return _Py_ERROR_SURROGATEPASS;
427 }
428 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
429 return _Py_ERROR_XMLCHARREFREPLACE;
430 }
431 return _Py_ERROR_OTHER;
432}
433
434
Victor Stinner22eb6892019-06-26 00:51:05 +0200435static inline int
436unicode_check_encoding_errors(const char *encoding, const char *errors)
437{
438 if (encoding == NULL && errors == NULL) {
439 return 0;
440 }
441
Victor Stinner81a7be32020-04-14 15:14:01 +0200442 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200443#ifndef Py_DEBUG
444 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200445 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200446 return 0;
447 }
448#else
449 /* Always check in debug mode */
450#endif
451
452 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
453 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200454 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200455 return 0;
456 }
457
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200458 /* Disable checks during Python finalization. For example, it allows to
459 call _PyObject_Dump() during finalization for debugging purpose. */
460 if (interp->finalizing) {
461 return 0;
462 }
463
Victor Stinner22eb6892019-06-26 00:51:05 +0200464 if (encoding != NULL) {
465 PyObject *handler = _PyCodec_Lookup(encoding);
466 if (handler == NULL) {
467 return -1;
468 }
469 Py_DECREF(handler);
470 }
471
472 if (errors != NULL) {
473 PyObject *handler = PyCodec_LookupError(errors);
474 if (handler == NULL) {
475 return -1;
476 }
477 Py_DECREF(handler);
478 }
479 return 0;
480}
481
482
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200483int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100484_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200485{
Victor Stinner68762572019-10-07 18:42:01 +0200486#define CHECK(expr) \
487 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
488
Victor Stinner910337b2011-10-03 03:20:16 +0200489 PyASCIIObject *ascii;
490 unsigned int kind;
491
Victor Stinner68762572019-10-07 18:42:01 +0200492 assert(op != NULL);
493 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200494
495 ascii = (PyASCIIObject *)op;
496 kind = ascii->state.kind;
497
Victor Stinnera3b334d2011-10-03 13:53:37 +0200498 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200499 CHECK(kind == PyUnicode_1BYTE_KIND);
500 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200501 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200502 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200503 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200504 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200505
Victor Stinnera41463c2011-10-04 01:05:08 +0200506 if (ascii->state.compact == 1) {
507 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200508 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200509 || kind == PyUnicode_2BYTE_KIND
510 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200511 CHECK(ascii->state.ascii == 0);
512 CHECK(ascii->state.ready == 1);
513 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100514 }
515 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200516 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
517
518 data = unicode->data.any;
519 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200520 CHECK(ascii->length == 0);
521 CHECK(ascii->hash == -1);
522 CHECK(ascii->state.compact == 0);
523 CHECK(ascii->state.ascii == 0);
524 CHECK(ascii->state.ready == 0);
525 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
526 CHECK(ascii->wstr != NULL);
527 CHECK(data == NULL);
528 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200529 }
530 else {
Victor Stinner68762572019-10-07 18:42:01 +0200531 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200532 || kind == PyUnicode_2BYTE_KIND
533 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200534 CHECK(ascii->state.compact == 0);
535 CHECK(ascii->state.ready == 1);
536 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200537 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200538 CHECK(compact->utf8 == data);
539 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200540 }
541 else
Victor Stinner68762572019-10-07 18:42:01 +0200542 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200543 }
544 }
545 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200546 if (
547#if SIZEOF_WCHAR_T == 2
548 kind == PyUnicode_2BYTE_KIND
549#else
550 kind == PyUnicode_4BYTE_KIND
551#endif
552 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200553 {
Victor Stinner68762572019-10-07 18:42:01 +0200554 CHECK(ascii->wstr == data);
555 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200556 } else
Victor Stinner68762572019-10-07 18:42:01 +0200557 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200558 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200559
560 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200561 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200562 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200563 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200564 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200565
566 /* check that the best kind is used: O(n) operation */
567 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200568 Py_ssize_t i;
569 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300570 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200571 Py_UCS4 ch;
572
573 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200574 for (i=0; i < ascii->length; i++)
575 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200576 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200577 if (ch > maxchar)
578 maxchar = ch;
579 }
580 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100581 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200582 CHECK(maxchar >= 128);
583 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100584 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200585 else
Victor Stinner68762572019-10-07 18:42:01 +0200586 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200587 }
Victor Stinner77faf692011-11-20 18:56:05 +0100588 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200589 CHECK(maxchar >= 0x100);
590 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100591 }
592 else {
Victor Stinner68762572019-10-07 18:42:01 +0200593 CHECK(maxchar >= 0x10000);
594 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100595 }
Victor Stinner68762572019-10-07 18:42:01 +0200596 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200597 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400598 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200599
600#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400601}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200602
Victor Stinner910337b2011-10-03 03:20:16 +0200603
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100604static PyObject*
605unicode_result_wchar(PyObject *unicode)
606{
607#ifndef Py_DEBUG
608 Py_ssize_t len;
609
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100610 len = _PyUnicode_WSTR_LENGTH(unicode);
611 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100612 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200613 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100614 }
615
616 if (len == 1) {
617 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100618 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100619 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200620 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100621 }
622 }
623
624 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200625 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100626 return NULL;
627 }
628#else
Victor Stinneraa771272012-10-04 02:32:58 +0200629 assert(Py_REFCNT(unicode) == 1);
630
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 /* don't make the result ready in debug mode to ensure that the caller
632 makes the string ready before using it */
633 assert(_PyUnicode_CheckConsistency(unicode, 1));
634#endif
635 return unicode;
636}
637
638static PyObject*
639unicode_result_ready(PyObject *unicode)
640{
641 Py_ssize_t length;
642
643 length = PyUnicode_GET_LENGTH(unicode);
644 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200645 PyObject *empty = unicode_get_empty();
646 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100647 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200648 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100649 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200650 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100651 }
652
653 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200654 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200655 if (kind == PyUnicode_1BYTE_KIND) {
656 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
657 Py_UCS1 ch = data[0];
658 struct _Py_unicode_state *state = get_unicode_state();
659 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100660 if (latin1_char != NULL) {
661 if (unicode != latin1_char) {
662 Py_INCREF(latin1_char);
663 Py_DECREF(unicode);
664 }
665 return latin1_char;
666 }
667 else {
668 assert(_PyUnicode_CheckConsistency(unicode, 1));
669 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200670 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100671 return unicode;
672 }
673 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200674 else {
675 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
676 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100677 }
678
679 assert(_PyUnicode_CheckConsistency(unicode, 1));
680 return unicode;
681}
682
683static PyObject*
684unicode_result(PyObject *unicode)
685{
686 assert(_PyUnicode_CHECK(unicode));
687 if (PyUnicode_IS_READY(unicode))
688 return unicode_result_ready(unicode);
689 else
690 return unicode_result_wchar(unicode);
691}
692
Victor Stinnerc4b49542011-12-11 22:44:26 +0100693static PyObject*
694unicode_result_unchanged(PyObject *unicode)
695{
696 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500697 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100698 return NULL;
699 Py_INCREF(unicode);
700 return unicode;
701 }
702 else
703 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100704 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100705}
706
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200707/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
708 ASCII, Latin1, UTF-8, etc. */
709static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200710backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200711 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
712{
Victor Stinnerad771582015-10-09 12:38:53 +0200713 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200714 Py_UCS4 ch;
715 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300716 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200717
718 assert(PyUnicode_IS_READY(unicode));
719 kind = PyUnicode_KIND(unicode);
720 data = PyUnicode_DATA(unicode);
721
722 size = 0;
723 /* determine replacement size */
724 for (i = collstart; i < collend; ++i) {
725 Py_ssize_t incr;
726
727 ch = PyUnicode_READ(kind, data, i);
728 if (ch < 0x100)
729 incr = 2+2;
730 else if (ch < 0x10000)
731 incr = 2+4;
732 else {
733 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200734 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200735 }
736 if (size > PY_SSIZE_T_MAX - incr) {
737 PyErr_SetString(PyExc_OverflowError,
738 "encoded result is too long for a Python string");
739 return NULL;
740 }
741 size += incr;
742 }
743
Victor Stinnerad771582015-10-09 12:38:53 +0200744 str = _PyBytesWriter_Prepare(writer, str, size);
745 if (str == NULL)
746 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200747
748 /* generate replacement */
749 for (i = collstart; i < collend; ++i) {
750 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200751 *str++ = '\\';
752 if (ch >= 0x00010000) {
753 *str++ = 'U';
754 *str++ = Py_hexdigits[(ch>>28)&0xf];
755 *str++ = Py_hexdigits[(ch>>24)&0xf];
756 *str++ = Py_hexdigits[(ch>>20)&0xf];
757 *str++ = Py_hexdigits[(ch>>16)&0xf];
758 *str++ = Py_hexdigits[(ch>>12)&0xf];
759 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200760 }
Victor Stinner797485e2015-10-09 03:17:30 +0200761 else if (ch >= 0x100) {
762 *str++ = 'u';
763 *str++ = Py_hexdigits[(ch>>12)&0xf];
764 *str++ = Py_hexdigits[(ch>>8)&0xf];
765 }
766 else
767 *str++ = 'x';
768 *str++ = Py_hexdigits[(ch>>4)&0xf];
769 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200770 }
771 return str;
772}
773
774/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
775 ASCII, Latin1, UTF-8, etc. */
776static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200777xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200778 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
779{
Victor Stinnerad771582015-10-09 12:38:53 +0200780 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200781 Py_UCS4 ch;
782 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300783 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200784
785 assert(PyUnicode_IS_READY(unicode));
786 kind = PyUnicode_KIND(unicode);
787 data = PyUnicode_DATA(unicode);
788
789 size = 0;
790 /* determine replacement size */
791 for (i = collstart; i < collend; ++i) {
792 Py_ssize_t incr;
793
794 ch = PyUnicode_READ(kind, data, i);
795 if (ch < 10)
796 incr = 2+1+1;
797 else if (ch < 100)
798 incr = 2+2+1;
799 else if (ch < 1000)
800 incr = 2+3+1;
801 else if (ch < 10000)
802 incr = 2+4+1;
803 else if (ch < 100000)
804 incr = 2+5+1;
805 else if (ch < 1000000)
806 incr = 2+6+1;
807 else {
808 assert(ch <= MAX_UNICODE);
809 incr = 2+7+1;
810 }
811 if (size > PY_SSIZE_T_MAX - incr) {
812 PyErr_SetString(PyExc_OverflowError,
813 "encoded result is too long for a Python string");
814 return NULL;
815 }
816 size += incr;
817 }
818
Victor Stinnerad771582015-10-09 12:38:53 +0200819 str = _PyBytesWriter_Prepare(writer, str, size);
820 if (str == NULL)
821 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200822
823 /* generate replacement */
824 for (i = collstart; i < collend; ++i) {
Christian Heimes07f2ade2020-11-18 16:38:53 +0100825 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
826 if (size < 0) {
827 return NULL;
828 }
829 str += size;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200830 }
831 return str;
832}
833
Thomas Wouters477c8d52006-05-27 19:21:47 +0000834/* --- Bloom Filters ----------------------------------------------------- */
835
836/* stuff to implement simple "bloom filters" for Unicode characters.
837 to keep things simple, we use a single bitmask, using the least 5
838 bits from each unicode characters as the bit index. */
839
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200840/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000841
Antoine Pitrouf068f942010-01-13 14:19:12 +0000842#if LONG_BIT >= 128
843#define BLOOM_WIDTH 128
844#elif LONG_BIT >= 64
845#define BLOOM_WIDTH 64
846#elif LONG_BIT >= 32
847#define BLOOM_WIDTH 32
848#else
849#error "LONG_BIT is smaller than 32"
850#endif
851
Thomas Wouters477c8d52006-05-27 19:21:47 +0000852#define BLOOM_MASK unsigned long
853
Serhiy Storchaka05997252013-01-26 12:14:02 +0200854static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000855
Antoine Pitrouf068f942010-01-13 14:19:12 +0000856#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000857
Benjamin Peterson29060642009-01-31 22:14:21 +0000858#define BLOOM_LINEBREAK(ch) \
859 ((ch) < 128U ? ascii_linebreak[(ch)] : \
860 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000861
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700862static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300863make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000864{
Victor Stinnera85af502013-04-09 21:53:54 +0200865#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
866 do { \
867 TYPE *data = (TYPE *)PTR; \
868 TYPE *end = data + LEN; \
869 Py_UCS4 ch; \
870 for (; data != end; data++) { \
871 ch = *data; \
872 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
873 } \
874 break; \
875 } while (0)
876
Thomas Wouters477c8d52006-05-27 19:21:47 +0000877 /* calculate simple bloom-style bitmask for a given unicode string */
878
Antoine Pitrouf068f942010-01-13 14:19:12 +0000879 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000880
881 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200882 switch (kind) {
883 case PyUnicode_1BYTE_KIND:
884 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
885 break;
886 case PyUnicode_2BYTE_KIND:
887 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
888 break;
889 case PyUnicode_4BYTE_KIND:
890 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
891 break;
892 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700893 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200894 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000895 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200896
897#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000898}
899
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300900static int
901ensure_unicode(PyObject *obj)
902{
903 if (!PyUnicode_Check(obj)) {
904 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200905 "must be str, not %.100s",
906 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300907 return -1;
908 }
909 return PyUnicode_READY(obj);
910}
911
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200912/* Compilation of templated routines */
913
Victor Stinner90ed8a62020-06-24 00:34:07 +0200914#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200915
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200916#include "stringlib/asciilib.h"
917#include "stringlib/fastsearch.h"
918#include "stringlib/partition.h"
919#include "stringlib/split.h"
920#include "stringlib/count.h"
921#include "stringlib/find.h"
922#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200923#include "stringlib/undef.h"
924
925#include "stringlib/ucs1lib.h"
926#include "stringlib/fastsearch.h"
927#include "stringlib/partition.h"
928#include "stringlib/split.h"
929#include "stringlib/count.h"
930#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300931#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200932#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200933#include "stringlib/undef.h"
934
935#include "stringlib/ucs2lib.h"
936#include "stringlib/fastsearch.h"
937#include "stringlib/partition.h"
938#include "stringlib/split.h"
939#include "stringlib/count.h"
940#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300941#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200942#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200943#include "stringlib/undef.h"
944
945#include "stringlib/ucs4lib.h"
946#include "stringlib/fastsearch.h"
947#include "stringlib/partition.h"
948#include "stringlib/split.h"
949#include "stringlib/count.h"
950#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300951#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200952#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200953#include "stringlib/undef.h"
954
Inada Naoki2c4928d2020-06-17 20:09:44 +0900955_Py_COMP_DIAG_PUSH
956_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200957#include "stringlib/unicodedefs.h"
958#include "stringlib/fastsearch.h"
959#include "stringlib/count.h"
960#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100961#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900962_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200963
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200964#undef STRINGLIB_GET_EMPTY
965
Guido van Rossumd57fd912000-03-10 22:53:23 +0000966/* --- Unicode Object ----------------------------------------------------- */
967
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700968static inline Py_ssize_t
969findchar(const void *s, int kind,
970 Py_ssize_t size, Py_UCS4 ch,
971 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200972{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200973 switch (kind) {
974 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200975 if ((Py_UCS1) ch != ch)
976 return -1;
977 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600978 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200979 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600980 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200981 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200982 if ((Py_UCS2) ch != ch)
983 return -1;
984 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600985 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200986 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600987 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200988 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200989 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600990 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200991 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600992 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200993 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700994 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200995 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200996}
997
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters in [old_length, current length) are overwritten, each
   byte being set to 0xff; nothing happens if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* offsets are in characters, so scale them by the character width */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1016
Victor Stinnerfe226c02011-10-03 03:52:20 +02001017static PyObject*
1018resize_compact(PyObject *unicode, Py_ssize_t length)
1019{
1020 Py_ssize_t char_size;
1021 Py_ssize_t struct_size;
1022 Py_ssize_t new_size;
1023 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001024 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001025#ifdef Py_DEBUG
1026 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1027#endif
1028
Victor Stinner79891572012-05-03 13:43:07 +02001029 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001031 assert(PyUnicode_IS_COMPACT(unicode));
1032
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001033 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001034 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001035 struct_size = sizeof(PyASCIIObject);
1036 else
1037 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001038 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001039
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1041 PyErr_NoMemory();
1042 return NULL;
1043 }
1044 new_size = (struct_size + (length + 1) * char_size);
1045
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001046 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001047 PyObject_Free(_PyUnicode_UTF8(unicode));
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001048 _PyUnicode_UTF8(unicode) = NULL;
1049 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1050 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001051#ifdef Py_REF_DEBUG
1052 _Py_RefTotal--;
1053#endif
1054#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001055 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001056#endif
Victor Stinner84def372011-12-11 20:04:56 +01001057
Victor Stinner32bd68c2020-12-01 10:37:39 +01001058 new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001059 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001060 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001061 PyErr_NoMemory();
1062 return NULL;
1063 }
Victor Stinner84def372011-12-11 20:04:56 +01001064 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001065 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001066
Victor Stinnerfe226c02011-10-03 03:52:20 +02001067 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001068 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001069 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001070 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001071 _PyUnicode_WSTR_LENGTH(unicode) = length;
1072 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001073 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001074 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001075 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001076 if (!PyUnicode_IS_ASCII(unicode))
1077 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001078 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001079#ifdef Py_DEBUG
1080 unicode_fill_invalid(unicode, old_length);
1081#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001082 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1083 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001084 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001085 return unicode;
1086}
1087
Alexander Belopolsky40018472011-02-26 01:02:56 +00001088static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001089resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001090{
Victor Stinner95663112011-10-04 01:03:50 +02001091 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001092 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001093 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001094 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001095
Victor Stinnerfe226c02011-10-03 03:52:20 +02001096 if (PyUnicode_IS_READY(unicode)) {
1097 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001098 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001099 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001100#ifdef Py_DEBUG
1101 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1102#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001103
1104 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001105 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001106 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1107 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001108
1109 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1110 PyErr_NoMemory();
1111 return -1;
1112 }
1113 new_size = (length + 1) * char_size;
1114
Victor Stinner7a9105a2011-12-12 00:13:42 +01001115 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1116 {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001117 PyObject_Free(_PyUnicode_UTF8(unicode));
Victor Stinner7a9105a2011-12-12 00:13:42 +01001118 _PyUnicode_UTF8(unicode) = NULL;
1119 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1120 }
1121
Victor Stinner32bd68c2020-12-01 10:37:39 +01001122 data = (PyObject *)PyObject_Realloc(data, new_size);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001123 if (data == NULL) {
1124 PyErr_NoMemory();
1125 return -1;
1126 }
1127 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001128 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001129 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001130 _PyUnicode_WSTR_LENGTH(unicode) = length;
1131 }
1132 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001133 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001134 _PyUnicode_UTF8_LENGTH(unicode) = length;
1135 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001136 _PyUnicode_LENGTH(unicode) = length;
1137 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001138#ifdef Py_DEBUG
1139 unicode_fill_invalid(unicode, old_length);
1140#endif
Victor Stinner95663112011-10-04 01:03:50 +02001141 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001142 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001143 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001144 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001145 }
Victor Stinner95663112011-10-04 01:03:50 +02001146 assert(_PyUnicode_WSTR(unicode) != NULL);
1147
1148 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001149 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001150 PyErr_NoMemory();
1151 return -1;
1152 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001153 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001154 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001155 wstr = PyObject_Realloc(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001156 if (!wstr) {
1157 PyErr_NoMemory();
1158 return -1;
1159 }
1160 _PyUnicode_WSTR(unicode) = wstr;
1161 _PyUnicode_WSTR(unicode)[length] = 0;
1162 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001163 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 return 0;
1165}
1166
Victor Stinnerfe226c02011-10-03 03:52:20 +02001167static PyObject*
1168resize_copy(PyObject *unicode, Py_ssize_t length)
1169{
1170 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001171 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001172 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001173
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001174 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001175
1176 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1177 if (copy == NULL)
1178 return NULL;
1179
1180 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001181 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001182 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001183 }
1184 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001185 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001186
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001187 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001188 if (w == NULL)
1189 return NULL;
1190 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1191 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001192 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001193 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001194 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001195 }
1196}
1197
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001199 Ux0000 terminated; some code (e.g. new_identifier)
1200 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
1202 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001203 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204
1205*/
1206
Alexander Belopolsky40018472011-02-26 01:02:56 +00001207static PyUnicodeObject *
1208_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001210 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001211 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001212
Thomas Wouters477c8d52006-05-27 19:21:47 +00001213 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001214 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001215 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 }
1217
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001218 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001219 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001220 return (PyUnicodeObject *)PyErr_NoMemory();
1221 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001222 if (length < 0) {
1223 PyErr_SetString(PyExc_SystemError,
1224 "Negative size passed to _PyUnicode_New");
1225 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001226 }
1227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001228 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1229 if (unicode == NULL)
1230 return NULL;
1231 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001232
1233 _PyUnicode_WSTR_LENGTH(unicode) = length;
1234 _PyUnicode_HASH(unicode) = -1;
1235 _PyUnicode_STATE(unicode).interned = 0;
1236 _PyUnicode_STATE(unicode).kind = 0;
1237 _PyUnicode_STATE(unicode).compact = 0;
1238 _PyUnicode_STATE(unicode).ready = 0;
1239 _PyUnicode_STATE(unicode).ascii = 0;
1240 _PyUnicode_DATA_ANY(unicode) = NULL;
1241 _PyUnicode_LENGTH(unicode) = 0;
1242 _PyUnicode_UTF8(unicode) = NULL;
1243 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1244
Victor Stinner32bd68c2020-12-01 10:37:39 +01001245 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001247 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001248 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001249 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251
Jeremy Hyltond8082792003-09-16 19:41:39 +00001252 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001253 * the caller fails before initializing str -- unicode_resize()
1254 * reads str[0], and the Keep-Alive optimization can keep memory
1255 * allocated for str alive across a call to unicode_dealloc(unicode).
1256 * We don't want unicode_resize to read uninitialized memory in
1257 * that case.
1258 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001259 _PyUnicode_WSTR(unicode)[0] = 0;
1260 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001261
Victor Stinner7931d9a2011-11-04 00:22:48 +01001262 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263 return unicode;
1264}
1265
Victor Stinnerf42dc442011-10-02 23:33:16 +02001266static const char*
1267unicode_kind_name(PyObject *unicode)
1268{
Victor Stinner42dfd712011-10-03 14:41:45 +02001269 /* don't check consistency: unicode_kind_name() is called from
1270 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001271 if (!PyUnicode_IS_COMPACT(unicode))
1272 {
1273 if (!PyUnicode_IS_READY(unicode))
1274 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001275 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001276 {
1277 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001278 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001279 return "legacy ascii";
1280 else
1281 return "legacy latin1";
1282 case PyUnicode_2BYTE_KIND:
1283 return "legacy UCS2";
1284 case PyUnicode_4BYTE_KIND:
1285 return "legacy UCS4";
1286 default:
1287 return "<legacy invalid kind>";
1288 }
1289 }
1290 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001291 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001292 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001293 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001294 return "ascii";
1295 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001296 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001297 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001298 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001299 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001300 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001301 default:
1302 return "<invalid compact kind>";
1303 }
1304}
1305
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001307/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
const char *
_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1312
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
const void *
_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001317const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001318 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001319 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1321 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1322 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1323 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1324 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1325 return PyUnicode_DATA(unicode);
1326}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001327
1328void
1329_PyUnicode_Dump(PyObject *op)
1330{
1331 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001332 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1333 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001334 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001335
Victor Stinnera849a4b2011-10-03 12:12:11 +02001336 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001337 {
1338 if (ascii->state.ascii)
1339 data = (ascii + 1);
1340 else
1341 data = (compact + 1);
1342 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001343 else
1344 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001345 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001346
Victor Stinnera849a4b2011-10-03 12:12:11 +02001347 if (ascii->wstr == data)
1348 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001349 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001350
Victor Stinnera3b334d2011-10-03 13:53:37 +02001351 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001352 printf(" (%zu), ", compact->wstr_length);
1353 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001354 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001355 }
1356 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001357 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001358 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001359}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001360#endif
1361
Victor Stinner91698d82020-06-25 14:07:40 +02001362static int
1363unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1364{
1365 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1366 // optimized to always use state->empty_string without having to check if
1367 // it is NULL or not.
1368 PyObject *empty = PyUnicode_New(1, 0);
1369 if (empty == NULL) {
1370 return -1;
1371 }
1372 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1373 _PyUnicode_LENGTH(empty) = 0;
1374 assert(_PyUnicode_CheckConsistency(empty, 1));
1375
1376 assert(state->empty_string == NULL);
1377 state->empty_string = empty;
1378 return 0;
1379}
1380
1381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382PyObject *
1383PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1384{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001385 /* Optimization for empty strings */
1386 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001387 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001388 }
1389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001390 PyObject *obj;
1391 PyCompactUnicodeObject *unicode;
1392 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001393 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001394 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 Py_ssize_t char_size;
1396 Py_ssize_t struct_size;
1397
Victor Stinner9e9d6892011-10-04 01:02:02 +02001398 is_ascii = 0;
1399 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 struct_size = sizeof(PyCompactUnicodeObject);
1401 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001402 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 char_size = 1;
1404 is_ascii = 1;
1405 struct_size = sizeof(PyASCIIObject);
1406 }
1407 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001408 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 char_size = 1;
1410 }
1411 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001412 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001413 char_size = 2;
1414 if (sizeof(wchar_t) == 2)
1415 is_sharing = 1;
1416 }
1417 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001418 if (maxchar > MAX_UNICODE) {
1419 PyErr_SetString(PyExc_SystemError,
1420 "invalid maximum character passed to PyUnicode_New");
1421 return NULL;
1422 }
Victor Stinner8f825062012-04-27 13:55:39 +02001423 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001424 char_size = 4;
1425 if (sizeof(wchar_t) == 4)
1426 is_sharing = 1;
1427 }
1428
1429 /* Ensure we won't overflow the size. */
1430 if (size < 0) {
1431 PyErr_SetString(PyExc_SystemError,
1432 "Negative size passed to PyUnicode_New");
1433 return NULL;
1434 }
1435 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1436 return PyErr_NoMemory();
1437
1438 /* Duplicated allocation code from _PyObject_New() instead of a call to
1439 * PyObject_New() so we are able to allocate space for the object and
1440 * it's data buffer.
1441 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001442 obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001443 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001445 }
1446 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001447
1448 unicode = (PyCompactUnicodeObject *)obj;
1449 if (is_ascii)
1450 data = ((PyASCIIObject*)obj) + 1;
1451 else
1452 data = unicode + 1;
1453 _PyUnicode_LENGTH(unicode) = size;
1454 _PyUnicode_HASH(unicode) = -1;
1455 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001456 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 _PyUnicode_STATE(unicode).compact = 1;
1458 _PyUnicode_STATE(unicode).ready = 1;
1459 _PyUnicode_STATE(unicode).ascii = is_ascii;
1460 if (is_ascii) {
1461 ((char*)data)[size] = 0;
1462 _PyUnicode_WSTR(unicode) = NULL;
1463 }
Victor Stinner8f825062012-04-27 13:55:39 +02001464 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001465 ((char*)data)[size] = 0;
1466 _PyUnicode_WSTR(unicode) = NULL;
1467 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001468 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001469 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001470 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 else {
1472 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001473 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001474 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001476 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477 ((Py_UCS4*)data)[size] = 0;
1478 if (is_sharing) {
1479 _PyUnicode_WSTR_LENGTH(unicode) = size;
1480 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1481 }
1482 else {
1483 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1484 _PyUnicode_WSTR(unicode) = NULL;
1485 }
1486 }
Victor Stinner8f825062012-04-27 13:55:39 +02001487#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001488 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001489#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001490 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001491 return obj;
1492}
1493
1494#if SIZEOF_WCHAR_T == 2
1495/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1496 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001497 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001498
1499 This function assumes that unicode can hold one more code point than wstr
1500 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001501static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001502unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001503 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504{
1505 const wchar_t *iter;
1506 Py_UCS4 *ucs4_out;
1507
Victor Stinner910337b2011-10-03 03:20:16 +02001508 assert(unicode != NULL);
1509 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001510 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1511 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1512
1513 for (iter = begin; iter < end; ) {
1514 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1515 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001516 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1517 && (iter+1) < end
1518 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001519 {
Victor Stinner551ac952011-11-29 22:58:13 +01001520 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521 iter += 2;
1522 }
1523 else {
1524 *ucs4_out++ = *iter;
1525 iter++;
1526 }
1527 }
1528 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1529 _PyUnicode_GET_LENGTH(unicode)));
1530
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001531}
1532#endif
1533
Victor Stinnercd9950f2011-10-02 00:34:53 +02001534static int
Victor Stinner488fa492011-12-12 00:01:39 +01001535unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001536{
Victor Stinner488fa492011-12-12 00:01:39 +01001537 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001538 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001539 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001540 return -1;
1541 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001542 return 0;
1543}
1544
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001545static int
1546_copy_characters(PyObject *to, Py_ssize_t to_start,
1547 PyObject *from, Py_ssize_t from_start,
1548 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001549{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001550 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001551 const void *from_data;
1552 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001553
Victor Stinneree4544c2012-05-09 22:24:08 +02001554 assert(0 <= how_many);
1555 assert(0 <= from_start);
1556 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001557 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001558 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001559 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001560
Victor Stinnerd3f08822012-05-29 12:57:52 +02001561 assert(PyUnicode_Check(to));
1562 assert(PyUnicode_IS_READY(to));
1563 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1564
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001565 if (how_many == 0)
1566 return 0;
1567
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001568 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001569 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001571 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001572
Victor Stinnerf1852262012-06-16 16:38:26 +02001573#ifdef Py_DEBUG
1574 if (!check_maxchar
1575 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1576 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001577 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001578 Py_UCS4 ch;
1579 Py_ssize_t i;
1580 for (i=0; i < how_many; i++) {
1581 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1582 assert(ch <= to_maxchar);
1583 }
1584 }
1585#endif
1586
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001587 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001588 if (check_maxchar
1589 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1590 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001591 /* Writing Latin-1 characters into an ASCII string requires to
1592 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001593 Py_UCS4 max_char;
1594 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001595 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001596 if (max_char >= 128)
1597 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001598 }
Christian Heimesf051e432016-09-13 20:22:02 +02001599 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001600 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001601 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001602 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001603 else if (from_kind == PyUnicode_1BYTE_KIND
1604 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001605 {
1606 _PyUnicode_CONVERT_BYTES(
1607 Py_UCS1, Py_UCS2,
1608 PyUnicode_1BYTE_DATA(from) + from_start,
1609 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1610 PyUnicode_2BYTE_DATA(to) + to_start
1611 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001612 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001613 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001614 && to_kind == PyUnicode_4BYTE_KIND)
1615 {
1616 _PyUnicode_CONVERT_BYTES(
1617 Py_UCS1, Py_UCS4,
1618 PyUnicode_1BYTE_DATA(from) + from_start,
1619 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1620 PyUnicode_4BYTE_DATA(to) + to_start
1621 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001622 }
1623 else if (from_kind == PyUnicode_2BYTE_KIND
1624 && to_kind == PyUnicode_4BYTE_KIND)
1625 {
1626 _PyUnicode_CONVERT_BYTES(
1627 Py_UCS2, Py_UCS4,
1628 PyUnicode_2BYTE_DATA(from) + from_start,
1629 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1630 PyUnicode_4BYTE_DATA(to) + to_start
1631 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001632 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001633 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001634 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1635
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001636 if (!check_maxchar) {
1637 if (from_kind == PyUnicode_2BYTE_KIND
1638 && to_kind == PyUnicode_1BYTE_KIND)
1639 {
1640 _PyUnicode_CONVERT_BYTES(
1641 Py_UCS2, Py_UCS1,
1642 PyUnicode_2BYTE_DATA(from) + from_start,
1643 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1644 PyUnicode_1BYTE_DATA(to) + to_start
1645 );
1646 }
1647 else if (from_kind == PyUnicode_4BYTE_KIND
1648 && to_kind == PyUnicode_1BYTE_KIND)
1649 {
1650 _PyUnicode_CONVERT_BYTES(
1651 Py_UCS4, Py_UCS1,
1652 PyUnicode_4BYTE_DATA(from) + from_start,
1653 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1654 PyUnicode_1BYTE_DATA(to) + to_start
1655 );
1656 }
1657 else if (from_kind == PyUnicode_4BYTE_KIND
1658 && to_kind == PyUnicode_2BYTE_KIND)
1659 {
1660 _PyUnicode_CONVERT_BYTES(
1661 Py_UCS4, Py_UCS2,
1662 PyUnicode_4BYTE_DATA(from) + from_start,
1663 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1664 PyUnicode_2BYTE_DATA(to) + to_start
1665 );
1666 }
1667 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001668 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001669 }
1670 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001671 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001672 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001673 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001674 Py_ssize_t i;
1675
Victor Stinnera0702ab2011-09-29 14:14:38 +02001676 for (i=0; i < how_many; i++) {
1677 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001678 if (ch > to_maxchar)
1679 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001680 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1681 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001682 }
1683 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001684 return 0;
1685}
1686
Victor Stinnerd3f08822012-05-29 12:57:52 +02001687void
1688_PyUnicode_FastCopyCharacters(
1689 PyObject *to, Py_ssize_t to_start,
1690 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001691{
1692 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1693}
1694
1695Py_ssize_t
1696PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1697 PyObject *from, Py_ssize_t from_start,
1698 Py_ssize_t how_many)
1699{
1700 int err;
1701
1702 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1703 PyErr_BadInternalCall();
1704 return -1;
1705 }
1706
Benjamin Petersonbac79492012-01-14 13:34:47 -05001707 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001708 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001709 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001710 return -1;
1711
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001712 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001713 PyErr_SetString(PyExc_IndexError, "string index out of range");
1714 return -1;
1715 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001716 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001717 PyErr_SetString(PyExc_IndexError, "string index out of range");
1718 return -1;
1719 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001720 if (how_many < 0) {
1721 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1722 return -1;
1723 }
1724 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001725 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1726 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001727 "Cannot write %zi characters at %zi "
1728 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001729 how_many, to_start, PyUnicode_GET_LENGTH(to));
1730 return -1;
1731 }
1732
1733 if (how_many == 0)
1734 return 0;
1735
Victor Stinner488fa492011-12-12 00:01:39 +01001736 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001737 return -1;
1738
1739 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1740 if (err) {
1741 PyErr_Format(PyExc_SystemError,
1742 "Cannot copy %s characters "
1743 "into a string of %s characters",
1744 unicode_kind_name(from),
1745 unicode_kind_name(to));
1746 return -1;
1747 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001748 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749}
1750
Victor Stinner17222162011-09-28 22:15:37 +02001751/* Find the maximum code point and count the number of surrogate pairs so a
1752 correct string length can be computed before converting a string to UCS4.
1753 This function counts single surrogates as a character and not as a pair.
1754
1755 Return 0 on success, or -1 on error. */
1756static int
1757find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1758 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001759{
1760 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001761 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762
Victor Stinnerc53be962011-10-02 21:33:54 +02001763 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 *num_surrogates = 0;
1765 *maxchar = 0;
1766
1767 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001768#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001769 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1770 && (iter+1) < end
1771 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1772 {
1773 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1774 ++(*num_surrogates);
1775 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001776 }
1777 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001778#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001779 {
1780 ch = *iter;
1781 iter++;
1782 }
1783 if (ch > *maxchar) {
1784 *maxchar = ch;
1785 if (*maxchar > MAX_UNICODE) {
1786 PyErr_Format(PyExc_ValueError,
1787 "character U+%x is not in range [U+0000; U+10ffff]",
1788 ch);
1789 return -1;
1790 }
1791 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001792 }
1793 return 0;
1794}
1795
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001796int
1797_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798{
1799 wchar_t *end;
1800 Py_UCS4 maxchar = 0;
1801 Py_ssize_t num_surrogates;
1802#if SIZEOF_WCHAR_T == 2
1803 Py_ssize_t length_wo_surrogates;
1804#endif
1805
Georg Brandl7597add2011-10-05 16:36:47 +02001806 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001807 strings were created using _PyObject_New() and where no canonical
1808 representation (the str field) has been set yet aka strings
1809 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001810 assert(_PyUnicode_CHECK(unicode));
1811 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001813 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001814 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001815 /* Actually, it should neither be interned nor be anything else: */
1816 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001819 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001821 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822
1823 if (maxchar < 256) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001824 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001825 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 PyErr_NoMemory();
1827 return -1;
1828 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001829 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830 _PyUnicode_WSTR(unicode), end,
1831 PyUnicode_1BYTE_DATA(unicode));
1832 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1833 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1834 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1835 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001836 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001838 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 }
1840 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001841 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001842 _PyUnicode_UTF8(unicode) = NULL;
1843 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001845 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001846 _PyUnicode_WSTR(unicode) = NULL;
1847 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1848 }
1849 /* In this case we might have to convert down from 4-byte native
1850 wchar_t to 2-byte unicode. */
1851 else if (maxchar < 65536) {
1852 assert(num_surrogates == 0 &&
1853 "FindMaxCharAndNumSurrogatePairs() messed up");
1854
Victor Stinner506f5922011-09-28 22:34:18 +02001855#if SIZEOF_WCHAR_T == 2
1856 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001857 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001858 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1859 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1860 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001861 _PyUnicode_UTF8(unicode) = NULL;
1862 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001863#else
1864 /* sizeof(wchar_t) == 4 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001865 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
Victor Stinner506f5922011-09-28 22:34:18 +02001866 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001867 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001868 PyErr_NoMemory();
1869 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001870 }
Victor Stinner506f5922011-09-28 22:34:18 +02001871 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1872 _PyUnicode_WSTR(unicode), end,
1873 PyUnicode_2BYTE_DATA(unicode));
1874 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1875 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1876 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001877 _PyUnicode_UTF8(unicode) = NULL;
1878 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner32bd68c2020-12-01 10:37:39 +01001879 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinner506f5922011-09-28 22:34:18 +02001880 _PyUnicode_WSTR(unicode) = NULL;
1881 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1882#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001883 }
Ikko Ashimine38811d62020-11-10 14:57:34 +09001884 /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 else {
1886#if SIZEOF_WCHAR_T == 2
1887 /* in case the native representation is 2-bytes, we need to allocate a
1888 new normalized 4-byte version. */
1889 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001890 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1891 PyErr_NoMemory();
1892 return -1;
1893 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001894 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001895 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001896 PyErr_NoMemory();
1897 return -1;
1898 }
1899 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1900 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001901 _PyUnicode_UTF8(unicode) = NULL;
1902 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001903 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1904 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001905 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001906 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001907 _PyUnicode_WSTR(unicode) = NULL;
1908 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1909#else
1910 assert(num_surrogates == 0);
1911
Victor Stinnerc3c74152011-10-02 20:39:55 +02001912 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001913 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001914 _PyUnicode_UTF8(unicode) = NULL;
1915 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001916 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1917#endif
1918 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1919 }
1920 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001921 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922 return 0;
1923}
1924
Alexander Belopolsky40018472011-02-26 01:02:56 +00001925static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001926unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927{
Walter Dörwald16807132007-05-25 13:52:07 +00001928 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001929 case SSTATE_NOT_INTERNED:
1930 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001931
Benjamin Peterson29060642009-01-31 22:14:21 +00001932 case SSTATE_INTERNED_MORTAL:
Victor Stinnerea251802020-12-26 02:58:33 +01001933 {
1934 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner3549ca32020-07-03 16:59:12 +02001935 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1936 references (key and value) which were ignored by
1937 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1938 to prevent calling unicode_dealloc() again. Adjust refcnt after
1939 PyDict_DelItem(). */
1940 assert(Py_REFCNT(unicode) == 0);
1941 Py_SET_REFCNT(unicode, 3);
Victor Stinnerea251802020-12-26 02:58:33 +01001942 if (PyDict_DelItem(state->interned, unicode) != 0) {
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001943 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1944 NULL);
1945 }
Victor Stinner3549ca32020-07-03 16:59:12 +02001946 assert(Py_REFCNT(unicode) == 1);
1947 Py_SET_REFCNT(unicode, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001948 break;
Victor Stinnerea251802020-12-26 02:58:33 +01001949 }
Walter Dörwald16807132007-05-25 13:52:07 +00001950
Benjamin Peterson29060642009-01-31 22:14:21 +00001951 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001952 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1953 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001954
Benjamin Peterson29060642009-01-31 22:14:21 +00001955 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001956 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001957 }
1958
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001959 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001960 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001961 }
1962 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001963 PyObject_Free(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001964 }
1965 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001966 PyObject_Free(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001967 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001968
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001969 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970}
1971
#ifdef Py_DEBUG
/* Debug-only helper: return 1 when `unicode` is one of the objects cached
   in the unicode state -- either the cached empty string or the cached
   one-character string for a code point below 256 -- and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *state = get_unicode_state();

    if (state->empty_string == unicode) {
        return 1;
    }

    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    /* Only a one-character string that is not in the legacy wchar
       representation can match an entry of the latin1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1) {
        return 0;
    }

    Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && state->latin1[ch] == unicode) ? 1 : 0;
}
#endif
1991
Alexander Belopolsky40018472011-02-26 01:02:56 +00001992static int
Victor Stinner488fa492011-12-12 00:01:39 +01001993unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001994{
Victor Stinner488fa492011-12-12 00:01:39 +01001995 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001996 if (Py_REFCNT(unicode) != 1)
1997 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001998 if (_PyUnicode_HASH(unicode) != -1)
1999 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002000 if (PyUnicode_CHECK_INTERNED(unicode))
2001 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002002 if (!PyUnicode_CheckExact(unicode))
2003 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002004#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002005 /* singleton refcount is greater than 1 */
2006 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002007#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002008 return 1;
2009}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002010
Victor Stinnerfe226c02011-10-03 03:52:20 +02002011static int
2012unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2013{
2014 PyObject *unicode;
2015 Py_ssize_t old_length;
2016
2017 assert(p_unicode != NULL);
2018 unicode = *p_unicode;
2019
2020 assert(unicode != NULL);
2021 assert(PyUnicode_Check(unicode));
2022 assert(0 <= length);
2023
Victor Stinner910337b2011-10-03 03:20:16 +02002024 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002025 old_length = PyUnicode_WSTR_LENGTH(unicode);
2026 else
2027 old_length = PyUnicode_GET_LENGTH(unicode);
2028 if (old_length == length)
2029 return 0;
2030
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002031 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002032 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002033 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002034 return 0;
2035 }
2036
Victor Stinner488fa492011-12-12 00:01:39 +01002037 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002038 PyObject *copy = resize_copy(unicode, length);
2039 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002040 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002041 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002042 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002043 }
2044
Victor Stinnerfe226c02011-10-03 03:52:20 +02002045 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002046 PyObject *new_unicode = resize_compact(unicode, length);
2047 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002048 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002049 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002050 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002051 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002052 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002053}
2054
Alexander Belopolsky40018472011-02-26 01:02:56 +00002055int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002056PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002057{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002058 PyObject *unicode;
2059 if (p_unicode == NULL) {
2060 PyErr_BadInternalCall();
2061 return -1;
2062 }
2063 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002064 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002065 {
2066 PyErr_BadInternalCall();
2067 return -1;
2068 }
2069 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002070}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002071
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002072/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002073
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002074 WARNING: The function doesn't copy the terminating null character and
2075 doesn't check the maximum character (may write a latin1 character in an
2076 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002077static void
2078unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2079 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002080{
2081 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002082 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002083 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002084
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002085 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002086 switch (kind) {
2087 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002088#ifdef Py_DEBUG
2089 if (PyUnicode_IS_ASCII(unicode)) {
2090 Py_UCS4 maxchar = ucs1lib_find_max_char(
2091 (const Py_UCS1*)str,
2092 (const Py_UCS1*)str + len);
2093 assert(maxchar < 128);
2094 }
2095#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002096 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002097 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002098 }
2099 case PyUnicode_2BYTE_KIND: {
2100 Py_UCS2 *start = (Py_UCS2 *)data + index;
2101 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002102
Victor Stinner184252a2012-06-16 02:57:41 +02002103 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002104 *ucs2 = (Py_UCS2)*str;
2105
2106 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002107 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002108 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002109 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002110 Py_UCS4 *start = (Py_UCS4 *)data + index;
2111 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002112
Victor Stinner184252a2012-06-16 02:57:41 +02002113 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002114 *ucs4 = (Py_UCS4)*str;
2115
2116 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002117 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002118 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002119 default:
2120 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002121 }
2122}
2123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002124static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002125get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002126{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002127 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002128
Victor Stinner2f9ada92020-06-24 02:22:21 +02002129 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002130 if (unicode) {
2131 Py_INCREF(unicode);
2132 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002133 }
Victor Stinner607b1022020-05-05 18:50:30 +02002134
2135 unicode = PyUnicode_New(1, ch);
2136 if (!unicode) {
2137 return NULL;
2138 }
2139
2140 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2141 assert(_PyUnicode_CheckConsistency(unicode, 1));
2142
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002143 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002144 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002145 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146}
2147
Victor Stinner985a82a2014-01-03 12:53:47 +01002148static PyObject*
2149unicode_char(Py_UCS4 ch)
2150{
2151 PyObject *unicode;
2152
2153 assert(ch <= MAX_UNICODE);
2154
Victor Stinner2f9ada92020-06-24 02:22:21 +02002155 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002156 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002157 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002158
Victor Stinner985a82a2014-01-03 12:53:47 +01002159 unicode = PyUnicode_New(1, ch);
2160 if (unicode == NULL)
2161 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002162
2163 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2164 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002165 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002166 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002167 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2168 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2169 }
2170 assert(_PyUnicode_CheckConsistency(unicode, 1));
2171 return unicode;
2172}
2173
Alexander Belopolsky40018472011-02-26 01:02:56 +00002174PyObject *
2175PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002176{
Inada Naoki038dd0f2020-06-30 15:26:56 +09002177 if (u == NULL) {
2178 if (size > 0) {
2179 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2180 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2181 "use PyUnicode_New() instead", 1) < 0) {
2182 return NULL;
2183 }
2184 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002185 return (PyObject*)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002186 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002187
2188 if (size < 0) {
2189 PyErr_BadInternalCall();
2190 return NULL;
2191 }
2192
2193 return PyUnicode_FromWideChar(u, size);
2194}
2195
2196PyObject *
2197PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2198{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002199 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002200 Py_UCS4 maxchar = 0;
2201 Py_ssize_t num_surrogates;
2202
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002203 if (u == NULL && size != 0) {
2204 PyErr_BadInternalCall();
2205 return NULL;
2206 }
2207
2208 if (size == -1) {
2209 size = wcslen(u);
2210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002212 /* If the Unicode data is known at construction time, we can apply
2213 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002215 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002216 if (size == 0)
2217 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002219 /* Single character Unicode objects in the Latin-1 range are
2220 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002221 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002222 return get_latin1_char((unsigned char)*u);
2223
2224 /* If not empty and not single character, copy the Unicode data
2225 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002226 if (find_maxchar_surrogates(u, u + size,
2227 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002228 return NULL;
2229
Victor Stinner8faf8212011-12-08 22:14:11 +01002230 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002231 if (!unicode)
2232 return NULL;
2233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002234 switch (PyUnicode_KIND(unicode)) {
2235 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002236 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002237 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2238 break;
2239 case PyUnicode_2BYTE_KIND:
2240#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002241 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002242#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002243 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002244 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2245#endif
2246 break;
2247 case PyUnicode_4BYTE_KIND:
2248#if SIZEOF_WCHAR_T == 2
2249 /* This is the only case which has to process surrogates, thus
2250 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002251 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002252#else
2253 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002254 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002255#endif
2256 break;
2257 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002258 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002261 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262}
2263
Alexander Belopolsky40018472011-02-26 01:02:56 +00002264PyObject *
2265PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002266{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002267 if (size < 0) {
2268 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002269 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002270 return NULL;
2271 }
Inada Naoki038dd0f2020-06-30 15:26:56 +09002272 if (u != NULL) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002273 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002274 }
2275 else {
2276 if (size > 0) {
2277 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2278 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2279 "use PyUnicode_New() instead", 1) < 0) {
2280 return NULL;
2281 }
2282 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002283 return (PyObject *)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002284 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002285}
2286
Alexander Belopolsky40018472011-02-26 01:02:56 +00002287PyObject *
2288PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002289{
2290 size_t size = strlen(u);
2291 if (size > PY_SSIZE_T_MAX) {
2292 PyErr_SetString(PyExc_OverflowError, "input too long");
2293 return NULL;
2294 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002295 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002296}
2297
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002298
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002299PyObject *
2300_PyUnicode_FromId(_Py_Identifier *id)
2301{
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002302 PyInterpreterState *interp = _PyInterpreterState_GET();
2303 struct _Py_unicode_ids *ids = &interp->unicode.ids;
2304
2305 int index = _Py_atomic_size_get(&id->index);
2306 if (index < 0) {
2307 struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2308
2309 PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2310 // Check again to detect concurrent access. Another thread can have
2311 // initialized the index while this thread waited for the lock.
2312 index = _Py_atomic_size_get(&id->index);
2313 if (index < 0) {
2314 assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2315 index = rt_ids->next_index;
2316 rt_ids->next_index++;
2317 _Py_atomic_size_set(&id->index, index);
2318 }
2319 PyThread_release_lock(rt_ids->lock);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002320 }
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002321 assert(index >= 0);
Victor Stinner297257f2020-06-02 14:39:45 +02002322
2323 PyObject *obj;
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002324 if (index < ids->size) {
2325 obj = ids->array[index];
2326 if (obj) {
2327 // Return a borrowed reference
2328 return obj;
2329 }
2330 }
2331
2332 obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
Victor Stinner297257f2020-06-02 14:39:45 +02002333 NULL, NULL);
2334 if (!obj) {
2335 return NULL;
2336 }
2337 PyUnicode_InternInPlace(&obj);
2338
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002339 if (index >= ids->size) {
2340 // Overallocate to reduce the number of realloc
2341 Py_ssize_t new_size = Py_MAX(index * 2, 16);
2342 Py_ssize_t item_size = sizeof(ids->array[0]);
2343 PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2344 if (new_array == NULL) {
2345 PyErr_NoMemory();
2346 return NULL;
2347 }
2348 memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2349 ids->array = new_array;
2350 ids->size = new_size;
2351 }
2352
2353 // The array stores a strong reference
2354 ids->array[index] = obj;
2355
2356 // Return a borrowed reference
2357 return obj;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002358}
2359
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002360
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002361static void
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002362unicode_clear_identifiers(PyThreadState *tstate)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002363{
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002364 PyInterpreterState *interp = _PyInterpreterState_GET();
2365 struct _Py_unicode_ids *ids = &interp->unicode.ids;
2366 for (Py_ssize_t i=0; i < ids->size; i++) {
2367 Py_XDECREF(ids->array[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002368 }
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002369 ids->size = 0;
2370 PyMem_Free(ids->array);
2371 ids->array = NULL;
2372 // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2373 // after Py_Finalize().
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002374}
2375
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002376
Benjamin Peterson0df54292012-03-26 14:50:32 -04002377/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002378
Victor Stinnerd3f08822012-05-29 12:57:52 +02002379PyObject*
2380_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002381{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002382 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002383 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002384 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002385#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002386 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002387#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002388 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002389 }
Victor Stinner785938e2011-12-11 20:09:03 +01002390 unicode = PyUnicode_New(size, 127);
2391 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002392 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002393 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2394 assert(_PyUnicode_CheckConsistency(unicode, 1));
2395 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002396}
2397
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002398static Py_UCS4
2399kind_maxchar_limit(unsigned int kind)
2400{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002401 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002402 case PyUnicode_1BYTE_KIND:
2403 return 0x80;
2404 case PyUnicode_2BYTE_KIND:
2405 return 0x100;
2406 case PyUnicode_4BYTE_KIND:
2407 return 0x10000;
2408 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002409 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002410 }
2411}
2412
Victor Stinner702c7342011-10-05 13:50:52 +02002413static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002414_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002415{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002416 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002417 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002418
Victor Stinner2f9ada92020-06-24 02:22:21 +02002419 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002420 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002421 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002422 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002423 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002424 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002425 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002426
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002427 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002428 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002429 if (!res)
2430 return NULL;
2431 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002432 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002433 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002434}
2435
Victor Stinnere57b1c02011-09-28 22:20:48 +02002436static PyObject*
2437_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002438{
2439 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002440 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002441
Serhiy Storchaka678db842013-01-26 12:16:36 +02002442 if (size == 0)
2443 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002444 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002445 if (size == 1)
2446 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002447
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002448 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002449 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450 if (!res)
2451 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002452 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002453 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002454 else {
2455 _PyUnicode_CONVERT_BYTES(
2456 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2457 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002458 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 return res;
2460}
2461
Victor Stinnere57b1c02011-09-28 22:20:48 +02002462static PyObject*
2463_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002464{
2465 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002466 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002467
Serhiy Storchaka678db842013-01-26 12:16:36 +02002468 if (size == 0)
2469 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002470 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002471 if (size == 1)
2472 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002473
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002474 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002475 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002476 if (!res)
2477 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002478 if (max_char < 256)
2479 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2480 PyUnicode_1BYTE_DATA(res));
2481 else if (max_char < 0x10000)
2482 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2483 PyUnicode_2BYTE_DATA(res));
2484 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002485 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002486 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002487 return res;
2488}
2489
2490PyObject*
2491PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2492{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002493 if (size < 0) {
2494 PyErr_SetString(PyExc_ValueError, "size must be positive");
2495 return NULL;
2496 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002497 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002498 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002499 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002501 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002502 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002503 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002504 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002505 PyErr_SetString(PyExc_SystemError, "invalid kind");
2506 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002507 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002508}
2509
Victor Stinnerece58de2012-04-23 23:36:38 +02002510Py_UCS4
2511_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2512{
2513 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002514 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002515
2516 assert(PyUnicode_IS_READY(unicode));
2517 assert(0 <= start);
2518 assert(end <= PyUnicode_GET_LENGTH(unicode));
2519 assert(start <= end);
2520
2521 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2522 return PyUnicode_MAX_CHAR_VALUE(unicode);
2523
2524 if (start == end)
2525 return 127;
2526
Victor Stinner94d558b2012-04-27 22:26:58 +02002527 if (PyUnicode_IS_ASCII(unicode))
2528 return 127;
2529
Victor Stinnerece58de2012-04-23 23:36:38 +02002530 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002531 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002532 endptr = (char *)startptr + end * kind;
2533 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002534 switch(kind) {
2535 case PyUnicode_1BYTE_KIND:
2536 return ucs1lib_find_max_char(startptr, endptr);
2537 case PyUnicode_2BYTE_KIND:
2538 return ucs2lib_find_max_char(startptr, endptr);
2539 case PyUnicode_4BYTE_KIND:
2540 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002541 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002542 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002543 }
2544}
2545
Victor Stinner25a4b292011-10-06 12:31:55 +02002546/* Ensure that a string uses the most efficient storage, if it is not the
2547 case: create a new string with of the right kind. Write NULL into *p_unicode
2548 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002549static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002550unicode_adjust_maxchar(PyObject **p_unicode)
2551{
2552 PyObject *unicode, *copy;
2553 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002554 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002555 unsigned int kind;
2556
2557 assert(p_unicode != NULL);
2558 unicode = *p_unicode;
2559 assert(PyUnicode_IS_READY(unicode));
2560 if (PyUnicode_IS_ASCII(unicode))
2561 return;
2562
2563 len = PyUnicode_GET_LENGTH(unicode);
2564 kind = PyUnicode_KIND(unicode);
2565 if (kind == PyUnicode_1BYTE_KIND) {
2566 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002567 max_char = ucs1lib_find_max_char(u, u + len);
2568 if (max_char >= 128)
2569 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002570 }
2571 else if (kind == PyUnicode_2BYTE_KIND) {
2572 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002573 max_char = ucs2lib_find_max_char(u, u + len);
2574 if (max_char >= 256)
2575 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002576 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002577 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002578 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002579 max_char = ucs4lib_find_max_char(u, u + len);
2580 if (max_char >= 0x10000)
2581 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002582 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002583 else
2584 Py_UNREACHABLE();
2585
Victor Stinner25a4b292011-10-06 12:31:55 +02002586 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002587 if (copy != NULL)
2588 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002589 Py_DECREF(unicode);
2590 *p_unicode = copy;
2591}
2592
Victor Stinner034f6cf2011-09-30 02:26:44 +02002593PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002594_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002595{
Victor Stinner87af4f22011-11-21 23:03:47 +01002596 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002597 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002598
Victor Stinner034f6cf2011-09-30 02:26:44 +02002599 if (!PyUnicode_Check(unicode)) {
2600 PyErr_BadInternalCall();
2601 return NULL;
2602 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002603 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002604 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002605
Victor Stinner87af4f22011-11-21 23:03:47 +01002606 length = PyUnicode_GET_LENGTH(unicode);
2607 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002608 if (!copy)
2609 return NULL;
2610 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2611
Christian Heimesf051e432016-09-13 20:22:02 +02002612 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002613 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002614 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002615 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002616}
2617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002618
Victor Stinnerbc603d12011-10-02 01:00:40 +02002619/* Widen Unicode objects to larger buffers. Don't write terminating null
2620 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002621
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002622static void*
2623unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002624{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002625 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002626
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002627 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002628 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002629 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002630 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002631 if (!result)
2632 return PyErr_NoMemory();
2633 assert(skind == PyUnicode_1BYTE_KIND);
2634 _PyUnicode_CONVERT_BYTES(
2635 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002636 (const Py_UCS1 *)data,
2637 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002638 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002640 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002641 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002642 if (!result)
2643 return PyErr_NoMemory();
2644 if (skind == PyUnicode_2BYTE_KIND) {
2645 _PyUnicode_CONVERT_BYTES(
2646 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002647 (const Py_UCS2 *)data,
2648 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002649 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002650 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002651 else {
2652 assert(skind == PyUnicode_1BYTE_KIND);
2653 _PyUnicode_CONVERT_BYTES(
2654 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002655 (const Py_UCS1 *)data,
2656 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002657 result);
2658 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002659 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002660 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002661 Py_UNREACHABLE();
2662 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002663 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002664}
2665
2666static Py_UCS4*
2667as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2668 int copy_null)
2669{
2670 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002671 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002672 Py_ssize_t len, targetlen;
2673 if (PyUnicode_READY(string) == -1)
2674 return NULL;
2675 kind = PyUnicode_KIND(string);
2676 data = PyUnicode_DATA(string);
2677 len = PyUnicode_GET_LENGTH(string);
2678 targetlen = len;
2679 if (copy_null)
2680 targetlen++;
2681 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002682 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002683 if (!target) {
2684 PyErr_NoMemory();
2685 return NULL;
2686 }
2687 }
2688 else {
2689 if (targetsize < targetlen) {
2690 PyErr_Format(PyExc_SystemError,
2691 "string is longer than the buffer");
2692 if (copy_null && 0 < targetsize)
2693 target[0] = 0;
2694 return NULL;
2695 }
2696 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002697 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002698 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002699 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002700 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002701 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002702 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002703 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2704 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002705 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002706 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002707 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002708 else {
2709 Py_UNREACHABLE();
2710 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002711 if (copy_null)
2712 target[len] = 0;
2713 return target;
2714}
2715
2716Py_UCS4*
2717PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2718 int copy_null)
2719{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002720 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002721 PyErr_BadInternalCall();
2722 return NULL;
2723 }
2724 return as_ucs4(string, target, targetsize, copy_null);
2725}
2726
2727Py_UCS4*
2728PyUnicode_AsUCS4Copy(PyObject *string)
2729{
2730 return as_ucs4(string, NULL, 0, 1);
2731}
2732
/* Size of the scratch buffers that receive the sprintf() output of an
   integer (up to %lld) or of %p.

   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits; 53/22 is used as an upper bound for log10(256).  The extra 2
   covers the sign plus one more character (presumably the terminating
   null character written by sprintf). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002737
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002738static int
2739unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2740 Py_ssize_t width, Py_ssize_t precision)
2741{
2742 Py_ssize_t length, fill, arglen;
2743 Py_UCS4 maxchar;
2744
2745 if (PyUnicode_READY(str) == -1)
2746 return -1;
2747
2748 length = PyUnicode_GET_LENGTH(str);
2749 if ((precision == -1 || precision >= length)
2750 && width <= length)
2751 return _PyUnicodeWriter_WriteStr(writer, str);
2752
2753 if (precision != -1)
2754 length = Py_MIN(precision, length);
2755
2756 arglen = Py_MAX(length, width);
2757 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2758 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2759 else
2760 maxchar = writer->maxchar;
2761
2762 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2763 return -1;
2764
2765 if (width > length) {
2766 fill = width - length;
2767 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2768 return -1;
2769 writer->pos += fill;
2770 }
2771
2772 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2773 str, 0, length);
2774 writer->pos += length;
2775 return 0;
2776}
2777
2778static int
Victor Stinner998b8062018-09-12 00:23:25 +02002779unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002780 Py_ssize_t width, Py_ssize_t precision)
2781{
2782 /* UTF-8 */
2783 Py_ssize_t length;
2784 PyObject *unicode;
2785 int res;
2786
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002787 if (precision == -1) {
2788 length = strlen(str);
2789 }
2790 else {
2791 length = 0;
2792 while (length < precision && str[length]) {
2793 length++;
2794 }
2795 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002796 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2797 if (unicode == NULL)
2798 return -1;
2799
2800 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2801 Py_DECREF(unicode);
2802 return res;
2803}
2804
Victor Stinner96865452011-03-01 23:44:09 +00002805static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002806unicode_fromformat_arg(_PyUnicodeWriter *writer,
2807 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002808{
Victor Stinnere215d962012-10-06 23:03:36 +02002809 const char *p;
2810 Py_ssize_t len;
2811 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002812 Py_ssize_t width;
2813 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002814 int longflag;
2815 int longlongflag;
2816 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002817 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002818
2819 p = f;
2820 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002821 zeropad = 0;
2822 if (*f == '0') {
2823 zeropad = 1;
2824 f++;
2825 }
Victor Stinner96865452011-03-01 23:44:09 +00002826
2827 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002828 width = -1;
2829 if (Py_ISDIGIT((unsigned)*f)) {
2830 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002831 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002832 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002833 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002834 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002835 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002836 return NULL;
2837 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002838 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002839 f++;
2840 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002841 }
2842 precision = -1;
2843 if (*f == '.') {
2844 f++;
2845 if (Py_ISDIGIT((unsigned)*f)) {
2846 precision = (*f - '0');
2847 f++;
2848 while (Py_ISDIGIT((unsigned)*f)) {
2849 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2850 PyErr_SetString(PyExc_ValueError,
2851 "precision too big");
2852 return NULL;
2853 }
2854 precision = (precision * 10) + (*f - '0');
2855 f++;
2856 }
2857 }
Victor Stinner96865452011-03-01 23:44:09 +00002858 if (*f == '%') {
2859 /* "%.3%s" => f points to "3" */
2860 f--;
2861 }
2862 }
2863 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002864 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002865 f--;
2866 }
Victor Stinner96865452011-03-01 23:44:09 +00002867
2868 /* Handle %ld, %lu, %lld and %llu. */
2869 longflag = 0;
2870 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002871 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002872 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002873 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002874 longflag = 1;
2875 ++f;
2876 }
Victor Stinner96865452011-03-01 23:44:09 +00002877 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002878 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002879 longlongflag = 1;
2880 f += 2;
2881 }
Victor Stinner96865452011-03-01 23:44:09 +00002882 }
2883 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002884 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002885 size_tflag = 1;
2886 ++f;
2887 }
Victor Stinnere215d962012-10-06 23:03:36 +02002888
2889 if (f[1] == '\0')
2890 writer->overallocate = 0;
2891
2892 switch (*f) {
2893 case 'c':
2894 {
2895 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002896 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002897 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002898 "character argument not in range(0x110000)");
2899 return NULL;
2900 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002901 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002902 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002903 break;
2904 }
2905
2906 case 'i':
2907 case 'd':
2908 case 'u':
2909 case 'x':
2910 {
2911 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002912 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002913 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002914
2915 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002916 if (longflag) {
2917 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2918 }
2919 else if (longlongflag) {
2920 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2921 }
2922 else if (size_tflag) {
2923 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2924 }
2925 else {
2926 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2927 }
Victor Stinnere215d962012-10-06 23:03:36 +02002928 }
2929 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002930 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002931 }
2932 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002933 if (longflag) {
2934 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2935 }
2936 else if (longlongflag) {
2937 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2938 }
2939 else if (size_tflag) {
2940 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2941 }
2942 else {
2943 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2944 }
Victor Stinnere215d962012-10-06 23:03:36 +02002945 }
2946 assert(len >= 0);
2947
Victor Stinnere215d962012-10-06 23:03:36 +02002948 if (precision < len)
2949 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002950
2951 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002952 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2953 return NULL;
2954
Victor Stinnere215d962012-10-06 23:03:36 +02002955 if (width > precision) {
2956 Py_UCS4 fillchar;
2957 fill = width - precision;
2958 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002959 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2960 return NULL;
2961 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002962 }
Victor Stinner15a11362012-10-06 23:48:20 +02002963 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002964 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002965 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2966 return NULL;
2967 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002968 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002969
Victor Stinner4a587072013-11-19 12:54:53 +01002970 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2971 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002972 break;
2973 }
2974
2975 case 'p':
2976 {
2977 char number[MAX_LONG_LONG_CHARS];
2978
2979 len = sprintf(number, "%p", va_arg(*vargs, void*));
2980 assert(len >= 0);
2981
2982 /* %p is ill-defined: ensure leading 0x. */
2983 if (number[1] == 'X')
2984 number[1] = 'x';
2985 else if (number[1] != 'x') {
2986 memmove(number + 2, number,
2987 strlen(number) + 1);
2988 number[0] = '0';
2989 number[1] = 'x';
2990 len += 2;
2991 }
2992
Victor Stinner4a587072013-11-19 12:54:53 +01002993 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002994 return NULL;
2995 break;
2996 }
2997
2998 case 's':
2999 {
3000 /* UTF-8 */
3001 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02003002 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003003 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003004 break;
3005 }
3006
3007 case 'U':
3008 {
3009 PyObject *obj = va_arg(*vargs, PyObject *);
3010 assert(obj && _PyUnicode_CHECK(obj));
3011
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003012 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003013 return NULL;
3014 break;
3015 }
3016
3017 case 'V':
3018 {
3019 PyObject *obj = va_arg(*vargs, PyObject *);
3020 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02003021 if (obj) {
3022 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003023 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003024 return NULL;
3025 }
3026 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003027 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02003028 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003029 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003030 }
3031 break;
3032 }
3033
3034 case 'S':
3035 {
3036 PyObject *obj = va_arg(*vargs, PyObject *);
3037 PyObject *str;
3038 assert(obj);
3039 str = PyObject_Str(obj);
3040 if (!str)
3041 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003042 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003043 Py_DECREF(str);
3044 return NULL;
3045 }
3046 Py_DECREF(str);
3047 break;
3048 }
3049
3050 case 'R':
3051 {
3052 PyObject *obj = va_arg(*vargs, PyObject *);
3053 PyObject *repr;
3054 assert(obj);
3055 repr = PyObject_Repr(obj);
3056 if (!repr)
3057 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003058 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003059 Py_DECREF(repr);
3060 return NULL;
3061 }
3062 Py_DECREF(repr);
3063 break;
3064 }
3065
3066 case 'A':
3067 {
3068 PyObject *obj = va_arg(*vargs, PyObject *);
3069 PyObject *ascii;
3070 assert(obj);
3071 ascii = PyObject_ASCII(obj);
3072 if (!ascii)
3073 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003074 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003075 Py_DECREF(ascii);
3076 return NULL;
3077 }
3078 Py_DECREF(ascii);
3079 break;
3080 }
3081
3082 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003083 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003084 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003085 break;
3086
3087 default:
3088 /* if we stumble upon an unknown formatting code, copy the rest
3089 of the format string to the output string. (we cannot just
3090 skip the code, since there's no way to know what's in the
3091 argument list) */
3092 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003093 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003094 return NULL;
3095 f = p+len;
3096 return f;
3097 }
3098
3099 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003100 return f;
3101}
3102
Walter Dörwaldd2034312007-05-18 16:29:38 +00003103PyObject *
3104PyUnicode_FromFormatV(const char *format, va_list vargs)
3105{
Victor Stinnere215d962012-10-06 23:03:36 +02003106 va_list vargs2;
3107 const char *f;
3108 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003109
Victor Stinner8f674cc2013-04-17 23:02:17 +02003110 _PyUnicodeWriter_Init(&writer);
3111 writer.min_length = strlen(format) + 100;
3112 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003113
Benjamin Peterson0c212142016-09-20 20:39:33 -07003114 // Copy varags to be able to pass a reference to a subfunction.
3115 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003116
3117 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003118 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003119 f = unicode_fromformat_arg(&writer, f, &vargs2);
3120 if (f == NULL)
3121 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003123 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003124 const char *p;
3125 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003126
Victor Stinnere215d962012-10-06 23:03:36 +02003127 p = f;
3128 do
3129 {
3130 if ((unsigned char)*p > 127) {
3131 PyErr_Format(PyExc_ValueError,
3132 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3133 "string, got a non-ASCII byte: 0x%02x",
3134 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003135 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003136 }
3137 p++;
3138 }
3139 while (*p != '\0' && *p != '%');
3140 len = p - f;
3141
3142 if (*p == '\0')
3143 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003144
3145 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003146 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003147
3148 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003149 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003150 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003151 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003152 return _PyUnicodeWriter_Finish(&writer);
3153
3154 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003155 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003156 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003157 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003158}
3159
Walter Dörwaldd2034312007-05-18 16:29:38 +00003160PyObject *
3161PyUnicode_FromFormat(const char *format, ...)
3162{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003163 PyObject* ret;
3164 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003165
3166#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003167 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003168#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003169 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003170#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003171 ret = PyUnicode_FromFormatV(format, vargs);
3172 va_end(vargs);
3173 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003174}
3175
Serhiy Storchakac46db922018-10-23 22:58:24 +03003176static Py_ssize_t
3177unicode_get_widechar_size(PyObject *unicode)
3178{
3179 Py_ssize_t res;
3180
3181 assert(unicode != NULL);
3182 assert(_PyUnicode_CHECK(unicode));
3183
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003184#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchakac46db922018-10-23 22:58:24 +03003185 if (_PyUnicode_WSTR(unicode) != NULL) {
3186 return PyUnicode_WSTR_LENGTH(unicode);
3187 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003188#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003189 assert(PyUnicode_IS_READY(unicode));
3190
3191 res = _PyUnicode_LENGTH(unicode);
3192#if SIZEOF_WCHAR_T == 2
3193 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3194 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3195 const Py_UCS4 *end = s + res;
3196 for (; s < end; ++s) {
3197 if (*s > 0xFFFF) {
3198 ++res;
3199 }
3200 }
3201 }
3202#endif
3203 return res;
3204}
3205
3206static void
3207unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3208{
Serhiy Storchakac46db922018-10-23 22:58:24 +03003209 assert(unicode != NULL);
3210 assert(_PyUnicode_CHECK(unicode));
3211
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003212#if USE_UNICODE_WCHAR_CACHE
3213 const wchar_t *wstr = _PyUnicode_WSTR(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003214 if (wstr != NULL) {
3215 memcpy(w, wstr, size * sizeof(wchar_t));
3216 return;
3217 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003218#else /* USE_UNICODE_WCHAR_CACHE */
3219 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3220 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3221 return;
3222 }
3223#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003224 assert(PyUnicode_IS_READY(unicode));
3225
3226 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3227 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3228 for (; size--; ++s, ++w) {
3229 *w = *s;
3230 }
3231 }
3232 else {
3233#if SIZEOF_WCHAR_T == 4
3234 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3235 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3236 for (; size--; ++s, ++w) {
3237 *w = *s;
3238 }
3239#else
3240 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3241 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3242 for (; size--; ++s, ++w) {
3243 Py_UCS4 ch = *s;
3244 if (ch > 0xFFFF) {
3245 assert(ch <= MAX_UNICODE);
3246 /* encode surrogate pair in this case */
3247 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3248 if (!size--)
3249 break;
3250 *w = Py_UNICODE_LOW_SURROGATE(ch);
3251 }
3252 else {
3253 *w = ch;
3254 }
3255 }
3256#endif
3257 }
3258}
3259
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003260#ifdef HAVE_WCHAR_H
3261
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003262/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003263
Victor Stinnerd88d9832011-09-06 02:00:05 +02003264 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003265 character) required to convert the unicode object. Ignore size argument.
3266
Victor Stinnerd88d9832011-09-06 02:00:05 +02003267 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003268 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003269 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003270Py_ssize_t
3271PyUnicode_AsWideChar(PyObject *unicode,
3272 wchar_t *w,
3273 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003274{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003275 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003276
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003277 if (unicode == NULL) {
3278 PyErr_BadInternalCall();
3279 return -1;
3280 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003281 if (!PyUnicode_Check(unicode)) {
3282 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003283 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003284 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003285
3286 res = unicode_get_widechar_size(unicode);
3287 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003288 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003289 }
3290
3291 if (size > res) {
3292 size = res + 1;
3293 }
3294 else {
3295 res = size;
3296 }
3297 unicode_copy_as_widechar(unicode, w, size);
3298 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003299}
3300
Victor Stinner137c34c2010-09-29 10:25:54 +00003301wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003302PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003303 Py_ssize_t *size)
3304{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003305 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003306 Py_ssize_t buflen;
3307
3308 if (unicode == NULL) {
3309 PyErr_BadInternalCall();
3310 return NULL;
3311 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003312 if (!PyUnicode_Check(unicode)) {
3313 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003314 return NULL;
3315 }
3316
Serhiy Storchakac46db922018-10-23 22:58:24 +03003317 buflen = unicode_get_widechar_size(unicode);
3318 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003319 if (buffer == NULL) {
3320 PyErr_NoMemory();
3321 return NULL;
3322 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003323 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3324 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003325 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003326 }
3327 else if (wcslen(buffer) != (size_t)buflen) {
Victor Stinner00d7abd2020-12-01 09:56:42 +01003328 PyMem_Free(buffer);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003329 PyErr_SetString(PyExc_ValueError,
3330 "embedded null character");
3331 return NULL;
3332 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003333 return buffer;
3334}
3335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003336#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003338int
3339_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3340{
3341 wchar_t **p = (wchar_t **)ptr;
3342 if (obj == NULL) {
3343#if !USE_UNICODE_WCHAR_CACHE
3344 PyMem_Free(*p);
3345#endif /* USE_UNICODE_WCHAR_CACHE */
3346 *p = NULL;
3347 return 1;
3348 }
3349 if (PyUnicode_Check(obj)) {
3350#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003351 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3352 if (*p == NULL) {
3353 return 0;
3354 }
3355 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003356#else /* USE_UNICODE_WCHAR_CACHE */
3357 *p = PyUnicode_AsWideCharString(obj, NULL);
3358 if (*p == NULL) {
3359 return 0;
3360 }
3361 return Py_CLEANUP_SUPPORTED;
3362#endif /* USE_UNICODE_WCHAR_CACHE */
3363 }
3364 PyErr_Format(PyExc_TypeError,
3365 "argument must be str, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003366 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003367 return 0;
3368}
3369
3370int
3371_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3372{
3373 wchar_t **p = (wchar_t **)ptr;
3374 if (obj == NULL) {
3375#if !USE_UNICODE_WCHAR_CACHE
3376 PyMem_Free(*p);
3377#endif /* USE_UNICODE_WCHAR_CACHE */
3378 *p = NULL;
3379 return 1;
3380 }
3381 if (obj == Py_None) {
3382 *p = NULL;
3383 return 1;
3384 }
3385 if (PyUnicode_Check(obj)) {
3386#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003387 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3388 if (*p == NULL) {
3389 return 0;
3390 }
3391 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003392#else /* USE_UNICODE_WCHAR_CACHE */
3393 *p = PyUnicode_AsWideCharString(obj, NULL);
3394 if (*p == NULL) {
3395 return 0;
3396 }
3397 return Py_CLEANUP_SUPPORTED;
3398#endif /* USE_UNICODE_WCHAR_CACHE */
3399 }
3400 PyErr_Format(PyExc_TypeError,
3401 "argument must be str or None, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003402 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003403 return 0;
3404}
3405
Alexander Belopolsky40018472011-02-26 01:02:56 +00003406PyObject *
3407PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003408{
Victor Stinner8faf8212011-12-08 22:14:11 +01003409 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003410 PyErr_SetString(PyExc_ValueError,
3411 "chr() arg not in range(0x110000)");
3412 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003413 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003414
Victor Stinner985a82a2014-01-03 12:53:47 +01003415 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003416}
3417
Alexander Belopolsky40018472011-02-26 01:02:56 +00003418PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003419PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003421 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003422 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003423 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003424 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003425 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003426 Py_INCREF(obj);
3427 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003428 }
3429 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003430 /* For a Unicode subtype that's not a Unicode object,
3431 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003432 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003433 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003434 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003435 "Can't convert '%.100s' object to str implicitly",
3436 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003437 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003438}
3439
Alexander Belopolsky40018472011-02-26 01:02:56 +00003440PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003441PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003442 const char *encoding,
3443 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003444{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003445 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003446 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003447
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003449 PyErr_BadInternalCall();
3450 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003452
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003453 /* Decoding bytes objects is the most common case and should be fast */
3454 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003455 if (PyBytes_GET_SIZE(obj) == 0) {
3456 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3457 return NULL;
3458 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003459 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003460 }
3461 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003462 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3463 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003464 }
3465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003466 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003467 PyErr_SetString(PyExc_TypeError,
3468 "decoding str is not supported");
3469 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003470 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003471
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003472 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3473 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3474 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003475 "decoding to str: need a bytes-like object, %.80s found",
3476 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003477 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003478 }
Tim Petersced69f82003-09-16 20:30:58 +00003479
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003480 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003481 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003482 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3483 return NULL;
3484 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003485 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003486 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003487
Serhiy Storchaka05997252013-01-26 12:14:02 +02003488 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003489 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003490 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491}
3492
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   ASCII alphanumeric characters and '.' are copied to 'lower' in lowercase.
   Every run of other characters located between two copied characters is
   replaced by a single '_'; leading and trailing runs are dropped.

   'lower' is a buffer of 'lower_len' bytes and receives a NUL-terminated
   result on success.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0). On error, the content of 'lower' is unspecified and
   may not be NUL-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* Not even room for the terminating NUL. Without this check,
           lower_len - 1 would wrap around and the bound below would be
           meaningless. */
        return 0;
    }

    e = encoding;
    l = lower;
    /* Last writable slot: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    /* Set while skipping a run of separator characters. */
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit the pending separator, unless it would be leading. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3541
Alexander Belopolsky40018472011-02-26 01:02:56 +00003542PyObject *
3543PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003544 Py_ssize_t size,
3545 const char *encoding,
3546 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003547{
3548 PyObject *buffer = NULL, *unicode;
3549 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003550 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3551
Victor Stinner22eb6892019-06-26 00:51:05 +02003552 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3553 return NULL;
3554 }
3555
Victor Stinnered076ed2019-06-26 01:49:32 +02003556 if (size == 0) {
3557 _Py_RETURN_UNICODE_EMPTY();
3558 }
3559
Victor Stinner942889a2016-09-05 15:40:10 -07003560 if (encoding == NULL) {
3561 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3562 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003563
Fred Drakee4315f52000-05-09 19:53:39 +00003564 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003565 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3566 char *lower = buflower;
3567
3568 /* Fast paths */
3569 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3570 lower += 3;
3571 if (*lower == '_') {
3572 /* Match "utf8" and "utf_8" */
3573 lower++;
3574 }
3575
3576 if (lower[0] == '8' && lower[1] == 0) {
3577 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3578 }
3579 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3580 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3581 }
3582 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3583 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3584 }
3585 }
3586 else {
3587 if (strcmp(lower, "ascii") == 0
3588 || strcmp(lower, "us_ascii") == 0) {
3589 return PyUnicode_DecodeASCII(s, size, errors);
3590 }
Steve Dowercc16be82016-09-08 10:35:16 -07003591 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003592 else if (strcmp(lower, "mbcs") == 0) {
3593 return PyUnicode_DecodeMBCS(s, size, errors);
3594 }
3595 #endif
3596 else if (strcmp(lower, "latin1") == 0
3597 || strcmp(lower, "latin_1") == 0
3598 || strcmp(lower, "iso_8859_1") == 0
3599 || strcmp(lower, "iso8859_1") == 0) {
3600 return PyUnicode_DecodeLatin1(s, size, errors);
3601 }
3602 }
Victor Stinner37296e82010-06-10 13:36:23 +00003603 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604
3605 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003606 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003607 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003608 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003609 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 if (buffer == NULL)
3611 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003612 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613 if (unicode == NULL)
3614 goto onError;
3615 if (!PyUnicode_Check(unicode)) {
3616 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003617 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003618 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003619 encoding,
3620 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621 Py_DECREF(unicode);
3622 goto onError;
3623 }
3624 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003625 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003626
Benjamin Peterson29060642009-01-31 22:14:21 +00003627 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003628 Py_XDECREF(buffer);
3629 return NULL;
3630}
3631
Alexander Belopolsky40018472011-02-26 01:02:56 +00003632PyObject *
3633PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003634 const char *encoding,
3635 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003636{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003637 if (!PyUnicode_Check(unicode)) {
3638 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003639 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003640 }
3641
Serhiy Storchaka00939072016-10-27 21:05:49 +03003642 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3643 "PyUnicode_AsDecodedObject() is deprecated; "
3644 "use PyCodec_Decode() to decode from str", 1) < 0)
3645 return NULL;
3646
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003647 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003648 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003649
3650 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003651 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003652}
3653
Alexander Belopolsky40018472011-02-26 01:02:56 +00003654PyObject *
3655PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003656 const char *encoding,
3657 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003658{
3659 PyObject *v;
3660
3661 if (!PyUnicode_Check(unicode)) {
3662 PyErr_BadArgument();
3663 goto onError;
3664 }
3665
Serhiy Storchaka00939072016-10-27 21:05:49 +03003666 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3667 "PyUnicode_AsDecodedUnicode() is deprecated; "
3668 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3669 return NULL;
3670
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003671 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003672 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003673
3674 /* Decode via the codec registry */
3675 v = PyCodec_Decode(unicode, encoding, errors);
3676 if (v == NULL)
3677 goto onError;
3678 if (!PyUnicode_Check(v)) {
3679 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003680 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003681 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003682 encoding,
3683 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003684 Py_DECREF(v);
3685 goto onError;
3686 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003687 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003688
Benjamin Peterson29060642009-01-31 22:14:21 +00003689 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003690 return NULL;
3691}
3692
Alexander Belopolsky40018472011-02-26 01:02:56 +00003693PyObject *
3694PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003695 Py_ssize_t size,
3696 const char *encoding,
3697 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698{
3699 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003700
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003701 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003703 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3705 Py_DECREF(unicode);
3706 return v;
3707}
3708
Alexander Belopolsky40018472011-02-26 01:02:56 +00003709PyObject *
3710PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003711 const char *encoding,
3712 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003713{
3714 PyObject *v;
3715
3716 if (!PyUnicode_Check(unicode)) {
3717 PyErr_BadArgument();
3718 goto onError;
3719 }
3720
Serhiy Storchaka00939072016-10-27 21:05:49 +03003721 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3722 "PyUnicode_AsEncodedObject() is deprecated; "
3723 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3724 "or PyCodec_Encode() for generic encoding", 1) < 0)
3725 return NULL;
3726
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003727 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003728 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003729
3730 /* Encode via the codec registry */
3731 v = PyCodec_Encode(unicode, encoding, errors);
3732 if (v == NULL)
3733 goto onError;
3734 return v;
3735
Benjamin Peterson29060642009-01-31 22:14:21 +00003736 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003737 return NULL;
3738}
3739
Victor Stinner1b579672011-12-17 05:47:23 +01003740
Victor Stinner2cba6b82018-01-10 22:46:15 +01003741static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003742unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003743 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003744{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003745 Py_ssize_t wlen;
3746 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3747 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003748 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003749 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003750
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003751 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003752 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003753 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003754 return NULL;
3755 }
3756
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003757 char *str;
3758 size_t error_pos;
3759 const char *reason;
3760 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003761 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003762 PyMem_Free(wstr);
3763
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003764 if (res != 0) {
3765 if (res == -2) {
3766 PyObject *exc;
3767 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3768 "locale", unicode,
3769 (Py_ssize_t)error_pos,
3770 (Py_ssize_t)(error_pos+1),
3771 reason);
3772 if (exc != NULL) {
3773 PyCodec_StrictErrors(exc);
3774 Py_DECREF(exc);
3775 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003776 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003777 else if (res == -3) {
3778 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3779 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003780 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003781 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003782 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003783 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003784 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003785
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003786 PyObject *bytes = PyBytes_FromString(str);
3787 PyMem_RawFree(str);
3788 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003789}
3790
Victor Stinnerad158722010-10-27 00:25:46 +00003791PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003792PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3793{
Victor Stinner709d23d2019-05-02 14:56:30 -04003794 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3795 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003796}
3797
3798PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003799PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003800{
Victor Stinner81a7be32020-04-14 15:14:01 +02003801 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003802 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3803 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003804 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003805 fs_codec->error_handler,
3806 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003807 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003808#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003809 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003810 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003811 fs_codec->encoding,
3812 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003813 }
Victor Stinnerad158722010-10-27 00:25:46 +00003814#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003815 else {
3816 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3817 machinery is not ready and so cannot be used:
3818 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003819 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3820 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003821 assert(filesystem_errors != NULL);
3822 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3823 assert(errors != _Py_ERROR_UNKNOWN);
3824#ifdef _Py_FORCE_UTF8_FS_ENCODING
3825 return unicode_encode_utf8(unicode, errors, NULL);
3826#else
3827 return unicode_encode_locale(unicode, errors, 0);
3828#endif
3829 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003830}
3831
Alexander Belopolsky40018472011-02-26 01:02:56 +00003832PyObject *
3833PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003834 const char *encoding,
3835 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836{
3837 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003838 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003839
Guido van Rossumd57fd912000-03-10 22:53:23 +00003840 if (!PyUnicode_Check(unicode)) {
3841 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003842 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843 }
Fred Drakee4315f52000-05-09 19:53:39 +00003844
Victor Stinner22eb6892019-06-26 00:51:05 +02003845 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3846 return NULL;
3847 }
3848
Victor Stinner942889a2016-09-05 15:40:10 -07003849 if (encoding == NULL) {
3850 return _PyUnicode_AsUTF8String(unicode, errors);
3851 }
3852
Fred Drakee4315f52000-05-09 19:53:39 +00003853 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003854 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3855 char *lower = buflower;
3856
3857 /* Fast paths */
3858 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3859 lower += 3;
3860 if (*lower == '_') {
3861 /* Match "utf8" and "utf_8" */
3862 lower++;
3863 }
3864
3865 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003866 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003867 }
3868 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3869 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3870 }
3871 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3872 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3873 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003874 }
Victor Stinner942889a2016-09-05 15:40:10 -07003875 else {
3876 if (strcmp(lower, "ascii") == 0
3877 || strcmp(lower, "us_ascii") == 0) {
3878 return _PyUnicode_AsASCIIString(unicode, errors);
3879 }
Steve Dowercc16be82016-09-08 10:35:16 -07003880#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003881 else if (strcmp(lower, "mbcs") == 0) {
3882 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3883 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003884#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003885 else if (strcmp(lower, "latin1") == 0 ||
3886 strcmp(lower, "latin_1") == 0 ||
3887 strcmp(lower, "iso_8859_1") == 0 ||
3888 strcmp(lower, "iso8859_1") == 0) {
3889 return _PyUnicode_AsLatin1String(unicode, errors);
3890 }
3891 }
Victor Stinner37296e82010-06-10 13:36:23 +00003892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893
3894 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003895 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003897 return NULL;
3898
3899 /* The normal path */
3900 if (PyBytes_Check(v))
3901 return v;
3902
3903 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003904 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003905 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003906 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003907
3908 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003909 "encoder %s returned bytearray instead of bytes; "
3910 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003911 encoding);
3912 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003913 Py_DECREF(v);
3914 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003915 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003916
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003917 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3918 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003919 Py_DECREF(v);
3920 return b;
3921 }
3922
3923 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003924 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003925 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003926 encoding,
3927 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003928 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003929 return NULL;
3930}
3931
Alexander Belopolsky40018472011-02-26 01:02:56 +00003932PyObject *
3933PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003934 const char *encoding,
3935 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003936{
3937 PyObject *v;
3938
3939 if (!PyUnicode_Check(unicode)) {
3940 PyErr_BadArgument();
3941 goto onError;
3942 }
3943
Serhiy Storchaka00939072016-10-27 21:05:49 +03003944 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3945 "PyUnicode_AsEncodedUnicode() is deprecated; "
3946 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3947 return NULL;
3948
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003949 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003950 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003951
3952 /* Encode via the codec registry */
3953 v = PyCodec_Encode(unicode, encoding, errors);
3954 if (v == NULL)
3955 goto onError;
3956 if (!PyUnicode_Check(v)) {
3957 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003958 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003959 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003960 encoding,
3961 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003962 Py_DECREF(v);
3963 goto onError;
3964 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003966
Benjamin Peterson29060642009-01-31 22:14:21 +00003967 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968 return NULL;
3969}
3970
Victor Stinner2cba6b82018-01-10 22:46:15 +01003971static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003972unicode_decode_locale(const char *str, Py_ssize_t len,
3973 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003974{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003975 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3976 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003977 return NULL;
3978 }
3979
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003980 wchar_t *wstr;
3981 size_t wlen;
3982 const char *reason;
3983 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003984 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003985 if (res != 0) {
3986 if (res == -2) {
3987 PyObject *exc;
3988 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3989 "locale", str, len,
3990 (Py_ssize_t)wlen,
3991 (Py_ssize_t)(wlen + 1),
3992 reason);
3993 if (exc != NULL) {
3994 PyCodec_StrictErrors(exc);
3995 Py_DECREF(exc);
3996 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003997 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003998 else if (res == -3) {
3999 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4000 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01004001 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004002 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01004003 }
Victor Stinner2f197072011-12-17 07:08:30 +01004004 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01004005 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004006
4007 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4008 PyMem_RawFree(wstr);
4009 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004010}
4011
4012PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01004013PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4014 const char *errors)
4015{
Victor Stinner709d23d2019-05-02 14:56:30 -04004016 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4017 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01004018}
4019
4020PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01004021PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004022{
4023 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04004024 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4025 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004026}
4027
4028
4029PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00004030PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004031 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00004032 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4033}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004034
Christian Heimes5894ba72007-11-04 11:43:14 +00004035PyObject*
4036PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4037{
Victor Stinner81a7be32020-04-14 15:14:01 +02004038 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02004039 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4040 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04004041 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004042 fs_codec->error_handler,
4043 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04004044 NULL);
4045 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004046#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02004047 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08004048 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004049 fs_codec->encoding,
4050 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004051 }
Victor Stinnerad158722010-10-27 00:25:46 +00004052#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004053 else {
4054 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4055 machinery is not ready and so cannot be used:
4056 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02004057 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4058 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004059 assert(filesystem_errors != NULL);
4060 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4061 assert(errors != _Py_ERROR_UNKNOWN);
4062#ifdef _Py_FORCE_UTF8_FS_ENCODING
4063 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4064#else
4065 return unicode_decode_locale(s, size, errors, 0);
4066#endif
4067 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004068}
4069
Martin v. Löwis011e8422009-05-05 04:43:17 +00004070
4071int
4072PyUnicode_FSConverter(PyObject* arg, void* addr)
4073{
Brett Cannonec6ce872016-09-06 15:50:29 -07004074 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004075 PyObject *output = NULL;
4076 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004077 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004078 if (arg == NULL) {
4079 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08004080 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004081 return 1;
4082 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004083 path = PyOS_FSPath(arg);
4084 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004085 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004086 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004087 if (PyBytes_Check(path)) {
4088 output = path;
4089 }
4090 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4091 output = PyUnicode_EncodeFSDefault(path);
4092 Py_DECREF(path);
4093 if (!output) {
4094 return 0;
4095 }
4096 assert(PyBytes_Check(output));
4097 }
4098
Victor Stinner0ea2a462010-04-30 00:22:08 +00004099 size = PyBytes_GET_SIZE(output);
4100 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02004101 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004102 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00004103 Py_DECREF(output);
4104 return 0;
4105 }
4106 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004107 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004108}
4109
4110
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004111int
4112PyUnicode_FSDecoder(PyObject* arg, void* addr)
4113{
Brett Cannona5711202016-09-06 19:36:01 -07004114 int is_buffer = 0;
4115 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004116 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004117 if (arg == NULL) {
4118 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004119 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004120 return 1;
4121 }
Brett Cannona5711202016-09-06 19:36:01 -07004122
4123 is_buffer = PyObject_CheckBuffer(arg);
4124 if (!is_buffer) {
4125 path = PyOS_FSPath(arg);
4126 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004127 return 0;
4128 }
Brett Cannona5711202016-09-06 19:36:01 -07004129 }
4130 else {
4131 path = arg;
4132 Py_INCREF(arg);
4133 }
4134
4135 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004136 output = path;
4137 }
4138 else if (PyBytes_Check(path) || is_buffer) {
4139 PyObject *path_bytes = NULL;
4140
4141 if (!PyBytes_Check(path) &&
4142 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004143 "path should be string, bytes, or os.PathLike, not %.200s",
4144 Py_TYPE(arg)->tp_name)) {
4145 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004146 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004147 }
4148 path_bytes = PyBytes_FromObject(path);
4149 Py_DECREF(path);
4150 if (!path_bytes) {
4151 return 0;
4152 }
4153 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4154 PyBytes_GET_SIZE(path_bytes));
4155 Py_DECREF(path_bytes);
4156 if (!output) {
4157 return 0;
4158 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004159 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004160 else {
4161 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004162 "path should be string, bytes, or os.PathLike, not %.200s",
4163 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004164 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004165 return 0;
4166 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004167 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004168 Py_DECREF(output);
4169 return 0;
4170 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004171 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004172 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004173 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004174 Py_DECREF(output);
4175 return 0;
4176 }
4177 *(PyObject**)addr = output;
4178 return Py_CLEANUP_SUPPORTED;
4179}
4180
4181
Inada Naoki02a4d572020-02-27 13:48:59 +09004182static int unicode_fill_utf8(PyObject *unicode);
4183
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004184const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004185PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004186{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004187 if (!PyUnicode_Check(unicode)) {
4188 PyErr_BadArgument();
4189 return NULL;
4190 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004191 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004192 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004193
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004194 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004195 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004196 return NULL;
4197 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004198 }
4199
4200 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004201 *psize = PyUnicode_UTF8_LENGTH(unicode);
4202 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004203}
4204
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004205const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004206PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004207{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004208 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4209}
4210
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004211Py_UNICODE *
4212PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4213{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004214 if (!PyUnicode_Check(unicode)) {
4215 PyErr_BadArgument();
4216 return NULL;
4217 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004218 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4219 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004220 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004221 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004222 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004223
Serhiy Storchakac46db922018-10-23 22:58:24 +03004224 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4225 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4226 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004227 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004228 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01004229 w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
Serhiy Storchakac46db922018-10-23 22:58:24 +03004230 if (w == NULL) {
4231 PyErr_NoMemory();
4232 return NULL;
4233 }
4234 unicode_copy_as_widechar(unicode, w, wlen + 1);
4235 _PyUnicode_WSTR(unicode) = w;
4236 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4237 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 }
4239 }
4240 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004241 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004242 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004243}
4244
Inada Naoki2c4928d2020-06-17 20:09:44 +09004245/* Deprecated APIs */
4246
4247_Py_COMP_DIAG_PUSH
4248_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4249
Alexander Belopolsky40018472011-02-26 01:02:56 +00004250Py_UNICODE *
4251PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004252{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004253 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254}
4255
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004256const Py_UNICODE *
4257_PyUnicode_AsUnicode(PyObject *unicode)
4258{
4259 Py_ssize_t size;
4260 const Py_UNICODE *wstr;
4261
4262 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4263 if (wstr && wcslen(wstr) != (size_t)size) {
4264 PyErr_SetString(PyExc_ValueError, "embedded null character");
4265 return NULL;
4266 }
4267 return wstr;
4268}
4269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004270
Alexander Belopolsky40018472011-02-26 01:02:56 +00004271Py_ssize_t
4272PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273{
4274 if (!PyUnicode_Check(unicode)) {
4275 PyErr_BadArgument();
4276 goto onError;
4277 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004278 if (_PyUnicode_WSTR(unicode) == NULL) {
4279 if (PyUnicode_AsUnicode(unicode) == NULL)
4280 goto onError;
4281 }
4282 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283
Benjamin Peterson29060642009-01-31 22:14:21 +00004284 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285 return -1;
4286}
4287
Inada Naoki2c4928d2020-06-17 20:09:44 +09004288_Py_COMP_DIAG_POP
4289
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004290Py_ssize_t
4291PyUnicode_GetLength(PyObject *unicode)
4292{
Victor Stinner07621332012-06-16 04:53:46 +02004293 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004294 PyErr_BadArgument();
4295 return -1;
4296 }
Victor Stinner07621332012-06-16 04:53:46 +02004297 if (PyUnicode_READY(unicode) == -1)
4298 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004299 return PyUnicode_GET_LENGTH(unicode);
4300}
4301
4302Py_UCS4
4303PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4304{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004305 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004306 int kind;
4307
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004308 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004309 PyErr_BadArgument();
4310 return (Py_UCS4)-1;
4311 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004312 if (PyUnicode_READY(unicode) == -1) {
4313 return (Py_UCS4)-1;
4314 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004315 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004316 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004317 return (Py_UCS4)-1;
4318 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004319 data = PyUnicode_DATA(unicode);
4320 kind = PyUnicode_KIND(unicode);
4321 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004322}
4323
4324int
4325PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4326{
4327 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004328 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004329 return -1;
4330 }
Victor Stinner488fa492011-12-12 00:01:39 +01004331 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004332 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004333 PyErr_SetString(PyExc_IndexError, "string index out of range");
4334 return -1;
4335 }
Victor Stinner488fa492011-12-12 00:01:39 +01004336 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004337 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004338 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4339 PyErr_SetString(PyExc_ValueError, "character out of range");
4340 return -1;
4341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004342 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4343 index, ch);
4344 return 0;
4345}
4346
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4352
Victor Stinner554f3f02010-06-16 23:33:54 +00004353/* create or adjust a UnicodeDecodeError */
4354static void
4355make_decode_exception(PyObject **exceptionObject,
4356 const char *encoding,
4357 const char *input, Py_ssize_t length,
4358 Py_ssize_t startpos, Py_ssize_t endpos,
4359 const char *reason)
4360{
4361 if (*exceptionObject == NULL) {
4362 *exceptionObject = PyUnicodeDecodeError_Create(
4363 encoding, input, length, startpos, endpos, reason);
4364 }
4365 else {
4366 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4367 goto onError;
4368 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4369 goto onError;
4370 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4371 goto onError;
4372 }
4373 return;
4374
4375onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004376 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004377}
4378
Steve Dowercc16be82016-09-08 10:35:16 -07004379#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004380static int
4381widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4382{
4383 if (newsize > *size) {
4384 wchar_t *newbuf = *buf;
4385 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4386 PyErr_NoMemory();
4387 return -1;
4388 }
4389 *buf = newbuf;
4390 }
4391 *size = newsize;
4392 return 0;
4393}
4394
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395/* error handling callback helper:
4396 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004397 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004398 and adjust various state variables.
4399 return 0 on success, -1 on error
4400*/
4401
Alexander Belopolsky40018472011-02-26 01:02:56 +00004402static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004403unicode_decode_call_errorhandler_wchar(
4404 const char *errors, PyObject **errorHandler,
4405 const char *encoding, const char *reason,
4406 const char **input, const char **inend, Py_ssize_t *startinpos,
4407 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004408 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004409{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004410 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004411
4412 PyObject *restuple = NULL;
4413 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004414 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004415 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004416 Py_ssize_t requiredsize;
4417 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004418 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004419 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420
4421 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004422 *errorHandler = PyCodec_LookupError(errors);
4423 if (*errorHandler == NULL)
4424 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004425 }
4426
Victor Stinner554f3f02010-06-16 23:33:54 +00004427 make_decode_exception(exceptionObject,
4428 encoding,
4429 *input, *inend - *input,
4430 *startinpos, *endinpos,
4431 reason);
4432 if (*exceptionObject == NULL)
4433 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434
Petr Viktorinffd97532020-02-11 17:46:57 +01004435 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004436 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004437 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004439 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004440 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004441 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004442 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004443 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004444
4445 /* Copy back the bytes variables, which might have been modified by the
4446 callback */
4447 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4448 if (!inputobj)
4449 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004450 *input = PyBytes_AS_STRING(inputobj);
4451 insize = PyBytes_GET_SIZE(inputobj);
4452 *inend = *input + insize;
4453 /* we can DECREF safely, as the exception has another reference,
4454 so the object won't go away. */
4455 Py_DECREF(inputobj);
4456
4457 if (newpos<0)
4458 newpos = insize+newpos;
4459 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004460 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004461 goto onError;
4462 }
4463
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004464#if USE_UNICODE_WCHAR_CACHE
4465_Py_COMP_DIAG_PUSH
4466_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4467 repwlen = PyUnicode_GetSize(repunicode);
4468 if (repwlen < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004469 goto onError;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004470_Py_COMP_DIAG_POP
4471#else /* USE_UNICODE_WCHAR_CACHE */
4472 repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4473 if (repwlen < 0)
4474 goto onError;
4475 repwlen--;
4476#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004477 /* need more space? (at least enough for what we
4478 have+the replacement+the rest of the string (starting
4479 at the new input position), so we won't have to check space
4480 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004481 requiredsize = *outpos;
4482 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4483 goto overflow;
4484 requiredsize += repwlen;
4485 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4486 goto overflow;
4487 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004488 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004489 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004490 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004491 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004492 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004493 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004494 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004495 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004496 PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004497 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004498 *endinpos = newpos;
4499 *inptr = *input + newpos;
4500
4501 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004502 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004503 return 0;
4504
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004505 overflow:
4506 PyErr_SetString(PyExc_OverflowError,
4507 "decoded result is too long for a Python string");
4508
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004509 onError:
4510 Py_XDECREF(restuple);
4511 return -1;
4512}
Steve Dowercc16be82016-09-08 10:35:16 -07004513#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004514
4515static int
4516unicode_decode_call_errorhandler_writer(
4517 const char *errors, PyObject **errorHandler,
4518 const char *encoding, const char *reason,
4519 const char **input, const char **inend, Py_ssize_t *startinpos,
4520 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4521 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4522{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004523 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004524
4525 PyObject *restuple = NULL;
4526 PyObject *repunicode = NULL;
4527 Py_ssize_t insize;
4528 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004529 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004530 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004531 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004532 int need_to_grow = 0;
4533 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004534
4535 if (*errorHandler == NULL) {
4536 *errorHandler = PyCodec_LookupError(errors);
4537 if (*errorHandler == NULL)
4538 goto onError;
4539 }
4540
4541 make_decode_exception(exceptionObject,
4542 encoding,
4543 *input, *inend - *input,
4544 *startinpos, *endinpos,
4545 reason);
4546 if (*exceptionObject == NULL)
4547 goto onError;
4548
Petr Viktorinffd97532020-02-11 17:46:57 +01004549 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004550 if (restuple == NULL)
4551 goto onError;
4552 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004553 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004554 goto onError;
4555 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004556 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004557 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004558
4559 /* Copy back the bytes variables, which might have been modified by the
4560 callback */
4561 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4562 if (!inputobj)
4563 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004564 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004565 *input = PyBytes_AS_STRING(inputobj);
4566 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004567 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004568 /* we can DECREF safely, as the exception has another reference,
4569 so the object won't go away. */
4570 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004571
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004572 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004573 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004574 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004575 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004576 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004577 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004578
Victor Stinner170ca6f2013-04-18 00:25:28 +02004579 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004580 if (replen > 1) {
4581 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004582 need_to_grow = 1;
4583 }
4584 new_inptr = *input + newpos;
4585 if (*inend - new_inptr > remain) {
4586 /* We don't know the decoding algorithm here so we make the worst
4587 assumption that one byte decodes to one unicode character.
4588 If unfortunately one byte could decode to more unicode characters,
4589 the decoder may write out-of-bound then. Is it possible for the
4590 algorithms using this function? */
4591 writer->min_length += *inend - new_inptr - remain;
4592 need_to_grow = 1;
4593 }
4594 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004595 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004596 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004597 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4598 goto onError;
4599 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004600 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004601 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004602
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004603 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004604 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004605
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004606 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004607 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004608 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004609
Benjamin Peterson29060642009-01-31 22:14:21 +00004610 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004611 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004612 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004613}
4614
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004615/* --- UTF-7 Codec -------------------------------------------------------- */
4616
Antoine Pitrou244651a2009-05-04 18:56:13 +00004617/* See RFC2152 for details. We encode conservatively and decode liberally. */
4618
4619/* Three simple macros defining base-64. */
4620
4621/* Is c a base-64 character? */
4622
#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= 'A' && (c) <= 'Z'))

/* Value (0..63) of the base-64 character c.  The caller must make sure
   that c really is a base-64 character; anything else maps to 63. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character encoding the bottom 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true when a byte met in a UTF-7 string decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4649
4650/* The UTF-7 encoder treats ASCII characters differently according to
4651 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4652 * the above). See RFC2152. This array identifies these different
4653 * sets:
4654 * 0 : "Set D"
4655 * alphanumeric and '(),-./:?
4656 * 1 : "Set O"
4657 * !"#$%&*;<=>@[]^_`{|}
4658 * 2 : "whitespace"
4659 * ht nl cr sp
4660 * 3 : special (must be base64 encoded)
4661 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4662 */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004663
/* Category of each ASCII code for the UTF-7 encoder: 0 = Set D,
   1 = Set O, 2 = whitespace, 3 = special (must be base64 encoded).
   The table is only ever read, hence const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        -- the character to test; only 1..127 can be direct.
 * directO  -- non-zero to encode Set O characters as themselves.
 * directWS -- non-zero to encode whitespace as itself.
 *
 * Every argument is parenthesized in the expansion so that expressions
 * such as `a || b` can be passed safely. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004695
Alexander Belopolsky40018472011-02-26 01:02:56 +00004696PyObject *
4697PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004698 Py_ssize_t size,
4699 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004700{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004701 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4702}
4703
Antoine Pitrou244651a2009-05-04 18:56:13 +00004704/* The decoder. The only state we preserve is our read position,
4705 * i.e. how many characters we have consumed. So if we end in the
4706 * middle of a shift sequence we have to back off the read position
4707 * and the output to the beginning of the sequence, otherwise we lose
4708 * all the shift state (seen bits, number of bits seen, high
4709 * surrogate). */
4710
Alexander Belopolsky40018472011-02-26 01:02:56 +00004711PyObject *
4712PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004713 Py_ssize_t size,
4714 const char *errors,
4715 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004716{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004717 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004718 Py_ssize_t startinpos;
4719 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004720 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004721 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004722 const char *errmsg = "";
4723 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004724 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004725 unsigned int base64bits = 0;
4726 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004727 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004728 PyObject *errorHandler = NULL;
4729 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004730
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004731 if (size == 0) {
4732 if (consumed)
4733 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004734 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004735 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004736
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004737 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004738 _PyUnicodeWriter_Init(&writer);
4739 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004740
4741 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004742 e = s + size;
4743
4744 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004745 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004746 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004747 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004748
Antoine Pitrou244651a2009-05-04 18:56:13 +00004749 if (inShift) { /* in a base-64 section */
4750 if (IS_BASE64(ch)) { /* consume a base-64 character */
4751 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4752 base64bits += 6;
4753 s++;
4754 if (base64bits >= 16) {
4755 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004756 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004757 base64bits -= 16;
4758 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004759 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004760 if (surrogate) {
4761 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004762 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4763 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004764 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004765 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004766 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004767 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004768 }
4769 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004770 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004771 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004772 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004773 }
4774 }
Victor Stinner551ac952011-11-29 22:58:13 +01004775 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004776 /* first surrogate */
4777 surrogate = outCh;
4778 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004779 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004780 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004781 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004782 }
4783 }
4784 }
4785 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004786 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004787 if (base64bits > 0) { /* left-over bits */
4788 if (base64bits >= 6) {
4789 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004790 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004791 errmsg = "partial character in shift sequence";
4792 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004793 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004794 else {
4795 /* Some bits remain; they should be zero */
4796 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004797 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004798 errmsg = "non-zero padding bits in shift sequence";
4799 goto utf7Error;
4800 }
4801 }
4802 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004803 if (surrogate && DECODE_DIRECT(ch)) {
4804 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4805 goto onError;
4806 }
4807 surrogate = 0;
4808 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004809 /* '-' is absorbed; other terminating
4810 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004811 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004812 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004813 }
4814 }
4815 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004816 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004817 s++; /* consume '+' */
4818 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004819 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004820 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004821 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004822 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004823 else if (s < e && !IS_BASE64(*s)) {
4824 s++;
4825 errmsg = "ill-formed sequence";
4826 goto utf7Error;
4827 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004828 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004829 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004830 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004831 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004832 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004833 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004834 }
4835 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004836 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004837 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004838 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004839 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004840 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004841 else {
4842 startinpos = s-starts;
4843 s++;
4844 errmsg = "unexpected special character";
4845 goto utf7Error;
4846 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004847 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004848utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004849 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004850 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004851 errors, &errorHandler,
4852 "utf7", errmsg,
4853 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004854 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004855 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004856 }
4857
Antoine Pitrou244651a2009-05-04 18:56:13 +00004858 /* end of string */
4859
4860 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4861 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004862 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004863 if (surrogate ||
4864 (base64bits >= 6) ||
4865 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004866 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004867 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004868 errors, &errorHandler,
4869 "utf7", "unterminated shift sequence",
4870 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004871 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004872 goto onError;
4873 if (s < e)
4874 goto restart;
4875 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004876 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004877
4878 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004879 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004880 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004881 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004882 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004883 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004884 writer.kind, writer.data, shiftOutStart);
4885 Py_XDECREF(errorHandler);
4886 Py_XDECREF(exc);
4887 _PyUnicodeWriter_Dealloc(&writer);
4888 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004889 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004890 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004891 }
4892 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004893 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004894 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004895 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004896
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004897 Py_XDECREF(errorHandler);
4898 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004899 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004900
Benjamin Peterson29060642009-01-31 22:14:21 +00004901 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004902 Py_XDECREF(errorHandler);
4903 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004904 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004905 return NULL;
4906}
4907
4908
Alexander Belopolsky40018472011-02-26 01:02:56 +00004909PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004910_PyUnicode_EncodeUTF7(PyObject *str,
4911 int base64SetO,
4912 int base64WhiteSpace,
4913 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004914{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004915 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004916 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004917 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004918 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004919 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004920 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004921 unsigned int base64bits = 0;
4922 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004923 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004924 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004925
Benjamin Petersonbac79492012-01-14 13:34:47 -05004926 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004927 return NULL;
4928 kind = PyUnicode_KIND(str);
4929 data = PyUnicode_DATA(str);
4930 len = PyUnicode_GET_LENGTH(str);
4931
4932 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004933 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004934
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004935 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004936 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004937 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004938 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004939 if (v == NULL)
4940 return NULL;
4941
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004942 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004943 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004944 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004945
Antoine Pitrou244651a2009-05-04 18:56:13 +00004946 if (inShift) {
4947 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4948 /* shifting out */
4949 if (base64bits) { /* output remaining bits */
4950 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4951 base64buffer = 0;
4952 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004953 }
4954 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004955 /* Characters not in the BASE64 set implicitly unshift the sequence
4956 so no '-' is required, except if the character is itself a '-' */
4957 if (IS_BASE64(ch) || ch == '-') {
4958 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004959 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004960 *out++ = (char) ch;
4961 }
4962 else {
4963 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004964 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004965 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004966 else { /* not in a shift sequence */
4967 if (ch == '+') {
4968 *out++ = '+';
4969 *out++ = '-';
4970 }
4971 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4972 *out++ = (char) ch;
4973 }
4974 else {
4975 *out++ = '+';
4976 inShift = 1;
4977 goto encode_char;
4978 }
4979 }
4980 continue;
4981encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004982 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004983 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004984
Antoine Pitrou244651a2009-05-04 18:56:13 +00004985 /* code first surrogate */
4986 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004987 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004988 while (base64bits >= 6) {
4989 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4990 base64bits -= 6;
4991 }
4992 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004993 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004994 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004995 base64bits += 16;
4996 base64buffer = (base64buffer << 16) | ch;
4997 while (base64bits >= 6) {
4998 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4999 base64bits -= 6;
5000 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00005001 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005002 if (base64bits)
5003 *out++= TO_BASE64(base64buffer << (6-base64bits) );
5004 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005005 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005006 if (_PyBytes_Resize(&v, out - start) < 0)
5007 return NULL;
5008 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005009}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005010PyObject *
5011PyUnicode_EncodeUTF7(const Py_UNICODE *s,
5012 Py_ssize_t size,
5013 int base64SetO,
5014 int base64WhiteSpace,
5015 const char *errors)
5016{
5017 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005018 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005019 if (tmp == NULL)
5020 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01005021 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005022 base64WhiteSpace, errors);
5023 Py_DECREF(tmp);
5024 return result;
5025}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005026
Antoine Pitrou244651a2009-05-04 18:56:13 +00005027#undef IS_BASE64
5028#undef FROM_BASE64
5029#undef TO_BASE64
5030#undef DECODE_DIRECT
5031#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005032
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033/* --- UTF-8 Codec -------------------------------------------------------- */
5034
Alexander Belopolsky40018472011-02-26 01:02:56 +00005035PyObject *
5036PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005037 Py_ssize_t size,
5038 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039{
Walter Dörwald69652032004-09-07 20:24:22 +00005040 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5041}
5042
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005043#include "stringlib/asciilib.h"
5044#include "stringlib/codecs.h"
5045#include "stringlib/undef.h"
5046
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005047#include "stringlib/ucs1lib.h"
5048#include "stringlib/codecs.h"
5049#include "stringlib/undef.h"
5050
5051#include "stringlib/ucs2lib.h"
5052#include "stringlib/codecs.h"
5053#include "stringlib/undef.h"
5054
5055#include "stringlib/ucs4lib.h"
5056#include "stringlib/codecs.h"
5057#include "stringlib/undef.h"
5058
Ma Lina0c603c2020-10-18 22:48:38 +08005059/* Mask to quickly check whether a C 'size_t' contains a
Antoine Pitrouab868312009-01-10 15:40:25 +00005060 non-ASCII, UTF8-encoded char. */
Ma Lina0c603c2020-10-18 22:48:38 +08005061#if (SIZEOF_SIZE_T == 8)
5062# define ASCII_CHAR_MASK 0x8080808080808080ULL
5063#elif (SIZEOF_SIZE_T == 4)
5064# define ASCII_CHAR_MASK 0x80808080U
Antoine Pitrouab868312009-01-10 15:40:25 +00005065#else
Ma Lina0c603c2020-10-18 22:48:38 +08005066# error C 'size_t' size should be either 4 or 8!
Antoine Pitrouab868312009-01-10 15:40:25 +00005067#endif
5068
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005069static Py_ssize_t
5070ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005071{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005072 const char *p = start;
Ma Lina0c603c2020-10-18 22:48:38 +08005073 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_SIZE_T);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005074
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005075 /*
5076 * Issue #17237: m68k is a bit different from most architectures in
5077 * that objects do not use "natural alignment" - for example, int and
5078 * long are only aligned at 2-byte boundaries. Therefore the assert()
5079 * won't work; also, tests have shown that skipping the "optimised
5080 * version" will even speed up m68k.
5081 */
5082#if !defined(__m68k__)
Ma Lina0c603c2020-10-18 22:48:38 +08005083#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
5084 assert(_Py_IS_ALIGNED(dest, SIZEOF_SIZE_T));
5085 if (_Py_IS_ALIGNED(p, SIZEOF_SIZE_T)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005086 /* Fast path, see in STRINGLIB(utf8_decode) for
5087 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005088 /* Help allocation */
5089 const char *_p = p;
5090 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005091 while (_p < aligned_end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005092 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005093 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00005094 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005095 *((size_t *)q) = value;
5096 _p += SIZEOF_SIZE_T;
5097 q += SIZEOF_SIZE_T;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005098 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005099 p = _p;
5100 while (p < end) {
5101 if ((unsigned char)*p & 0x80)
5102 break;
5103 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005104 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005105 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005107#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02005108#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005109 while (p < end) {
5110 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5111 for an explanation. */
Ma Lina0c603c2020-10-18 22:48:38 +08005112 if (_Py_IS_ALIGNED(p, SIZEOF_SIZE_T)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005113 /* Help allocation */
5114 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005115 while (_p < aligned_end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005116 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005117 if (value & ASCII_CHAR_MASK)
5118 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005119 _p += SIZEOF_SIZE_T;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005120 }
5121 p = _p;
5122 if (_p == end)
5123 break;
5124 }
5125 if ((unsigned char)*p & 0x80)
5126 break;
5127 ++p;
5128 }
5129 memcpy(dest, start, p - start);
5130 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131}
Antoine Pitrouab868312009-01-10 15:40:25 +00005132
Victor Stinner709d23d2019-05-02 14:56:30 -04005133static PyObject *
5134unicode_decode_utf8(const char *s, Py_ssize_t size,
5135 _Py_error_handler error_handler, const char *errors,
5136 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005137{
Victor Stinner785938e2011-12-11 20:09:03 +01005138 if (size == 0) {
5139 if (consumed)
5140 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005141 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005142 }
5143
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005144 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5145 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005146 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005148 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005149 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005150 }
5151
Inada Naoki770847a2019-06-24 12:30:24 +09005152 const char *starts = s;
5153 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005154
Inada Naoki770847a2019-06-24 12:30:24 +09005155 // fast path: try ASCII string.
5156 PyObject *u = PyUnicode_New(size, 127);
5157 if (u == NULL) {
5158 return NULL;
5159 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005160 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005161 if (s == end) {
5162 return u;
5163 }
5164
5165 // Use _PyUnicodeWriter after fast path is failed.
5166 _PyUnicodeWriter writer;
5167 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5168 writer.pos = s - starts;
5169
5170 Py_ssize_t startinpos, endinpos;
5171 const char *errmsg = "";
5172 PyObject *error_handler_obj = NULL;
5173 PyObject *exc = NULL;
5174
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005175 while (s < end) {
5176 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005177 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005178
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005179 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005180 if (PyUnicode_IS_ASCII(writer.buffer))
5181 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005182 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005183 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005184 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005185 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005186 } else {
5187 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005188 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005189 }
5190
5191 switch (ch) {
5192 case 0:
5193 if (s == end || consumed)
5194 goto End;
5195 errmsg = "unexpected end of data";
5196 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005197 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005198 break;
5199 case 1:
5200 errmsg = "invalid start byte";
5201 startinpos = s - starts;
5202 endinpos = startinpos + 1;
5203 break;
5204 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005205 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5206 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5207 {
5208 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005209 goto End;
5210 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005211 /* fall through */
5212 case 3:
5213 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005214 errmsg = "invalid continuation byte";
5215 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005216 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005217 break;
5218 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005219 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005220 goto onError;
5221 continue;
5222 }
5223
Victor Stinner1d65d912015-10-05 13:43:50 +02005224 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005225 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005226
5227 switch (error_handler) {
5228 case _Py_ERROR_IGNORE:
5229 s += (endinpos - startinpos);
5230 break;
5231
5232 case _Py_ERROR_REPLACE:
5233 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5234 goto onError;
5235 s += (endinpos - startinpos);
5236 break;
5237
5238 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005239 {
5240 Py_ssize_t i;
5241
Victor Stinner1d65d912015-10-05 13:43:50 +02005242 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5243 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005244 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005245 ch = (Py_UCS4)(unsigned char)(starts[i]);
5246 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5247 ch + 0xdc00);
5248 writer.pos++;
5249 }
5250 s += (endinpos - startinpos);
5251 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005252 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005253
5254 default:
5255 if (unicode_decode_call_errorhandler_writer(
5256 errors, &error_handler_obj,
5257 "utf-8", errmsg,
5258 &starts, &end, &startinpos, &endinpos, &exc, &s,
5259 &writer))
5260 goto onError;
5261 }
Victor Stinner785938e2011-12-11 20:09:03 +01005262 }
5263
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005264End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005265 if (consumed)
5266 *consumed = s - starts;
5267
Victor Stinner1d65d912015-10-05 13:43:50 +02005268 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005269 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005270 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005271
5272onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005273 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005274 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005275 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005276 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005277}
5278
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005279
Victor Stinner709d23d2019-05-02 14:56:30 -04005280PyObject *
5281PyUnicode_DecodeUTF8Stateful(const char *s,
5282 Py_ssize_t size,
5283 const char *errors,
5284 Py_ssize_t *consumed)
5285{
5286 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5287}
5288
5289
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005290/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5291 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005292
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005293 On success, write a pointer to a newly allocated wide character string into
5294 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5295 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005296
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005297 On memory allocation failure, return -1.
5298
5299 On decoding error (if surrogateescape is zero), return -2. If wlen is
5300 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5301 is not NULL, write the decoding error message into *reason. */
5302int
5303_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005304 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005305{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005306 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005307 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005308 wchar_t *unicode;
5309 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005310
Victor Stinner3d4226a2018-08-29 22:21:32 +02005311 int surrogateescape = 0;
5312 int surrogatepass = 0;
5313 switch (errors)
5314 {
5315 case _Py_ERROR_STRICT:
5316 break;
5317 case _Py_ERROR_SURROGATEESCAPE:
5318 surrogateescape = 1;
5319 break;
5320 case _Py_ERROR_SURROGATEPASS:
5321 surrogatepass = 1;
5322 break;
5323 default:
5324 return -3;
5325 }
5326
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005327 /* Note: size will always be longer than the resulting Unicode
5328 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005329 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005330 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005331 }
5332
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005333 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005334 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005335 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005336 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005337
5338 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005339 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005340 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005341 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005342 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005343#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005344 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005345#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005346 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005347#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005348 if (ch > 0xFF) {
5349#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005350 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005351#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005352 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005353 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005354 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5355 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5356#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005357 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005358 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005359 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005360 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005361 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005362
5363 if (surrogateescape) {
5364 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5365 }
5366 else {
5367 /* Is it a valid three-byte code? */
5368 if (surrogatepass
5369 && (e - s) >= 3
5370 && (s[0] & 0xf0) == 0xe0
5371 && (s[1] & 0xc0) == 0x80
5372 && (s[2] & 0xc0) == 0x80)
5373 {
5374 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5375 s += 3;
5376 unicode[outpos++] = ch;
5377 }
5378 else {
5379 PyMem_RawFree(unicode );
5380 if (reason != NULL) {
5381 switch (ch) {
5382 case 0:
5383 *reason = "unexpected end of data";
5384 break;
5385 case 1:
5386 *reason = "invalid start byte";
5387 break;
5388 /* 2, 3, 4 */
5389 default:
5390 *reason = "invalid continuation byte";
5391 break;
5392 }
5393 }
5394 if (wlen != NULL) {
5395 *wlen = s - orig_s;
5396 }
5397 return -2;
5398 }
5399 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005400 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005401 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005402 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005403 if (wlen) {
5404 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005405 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005406 *wstr = unicode;
5407 return 0;
5408}
5409
Victor Stinner5f9cf232019-03-19 01:46:25 +01005410
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005411wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005412_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5413 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005414{
5415 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005416 int res = _Py_DecodeUTF8Ex(arg, arglen,
5417 &wstr, wlen,
5418 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005419 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005420 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5421 assert(res != -3);
5422 if (wlen) {
5423 *wlen = (size_t)res;
5424 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005425 return NULL;
5426 }
5427 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005428}
5429
Antoine Pitrouab868312009-01-10 15:40:25 +00005430
Victor Stinnere47e6982017-12-21 15:45:16 +01005431/* UTF-8 encoder using the surrogateescape error handler .
5432
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005433 On success, return 0 and write the newly allocated character string (use
5434 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005435
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005436 On encoding failure, return -2 and write the position of the invalid
5437 surrogate character into *error_pos (if error_pos is set) and the decoding
5438 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005439
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005440 On memory allocation failure, return -1. */
5441int
5442_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005443 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005444{
5445 const Py_ssize_t max_char_size = 4;
5446 Py_ssize_t len = wcslen(text);
5447
5448 assert(len >= 0);
5449
Victor Stinner3d4226a2018-08-29 22:21:32 +02005450 int surrogateescape = 0;
5451 int surrogatepass = 0;
5452 switch (errors)
5453 {
5454 case _Py_ERROR_STRICT:
5455 break;
5456 case _Py_ERROR_SURROGATEESCAPE:
5457 surrogateescape = 1;
5458 break;
5459 case _Py_ERROR_SURROGATEPASS:
5460 surrogatepass = 1;
5461 break;
5462 default:
5463 return -3;
5464 }
5465
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005466 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5467 return -1;
5468 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005469 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005470 if (raw_malloc) {
5471 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005472 }
5473 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005474 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005475 }
5476 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005477 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005478 }
5479
5480 char *p = bytes;
5481 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005482 for (i = 0; i < len; ) {
5483 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005484 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005485 i++;
5486#if Py_UNICODE_SIZE == 2
5487 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5488 && i < len
5489 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5490 {
5491 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5492 i++;
5493 }
5494#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005495
5496 if (ch < 0x80) {
5497 /* Encode ASCII */
5498 *p++ = (char) ch;
5499
5500 }
5501 else if (ch < 0x0800) {
5502 /* Encode Latin-1 */
5503 *p++ = (char)(0xc0 | (ch >> 6));
5504 *p++ = (char)(0x80 | (ch & 0x3f));
5505 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005506 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005507 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005508 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005509 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005510 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005511 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005512 if (reason != NULL) {
5513 *reason = "encoding error";
5514 }
5515 if (raw_malloc) {
5516 PyMem_RawFree(bytes);
5517 }
5518 else {
5519 PyMem_Free(bytes);
5520 }
5521 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005522 }
5523 *p++ = (char)(ch & 0xff);
5524 }
5525 else if (ch < 0x10000) {
5526 *p++ = (char)(0xe0 | (ch >> 12));
5527 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5528 *p++ = (char)(0x80 | (ch & 0x3f));
5529 }
5530 else { /* ch >= 0x10000 */
5531 assert(ch <= MAX_UNICODE);
5532 /* Encode UCS4 Unicode ordinals */
5533 *p++ = (char)(0xf0 | (ch >> 18));
5534 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5535 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5536 *p++ = (char)(0x80 | (ch & 0x3f));
5537 }
5538 }
5539 *p++ = '\0';
5540
5541 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005542 char *bytes2;
5543 if (raw_malloc) {
5544 bytes2 = PyMem_RawRealloc(bytes, final_size);
5545 }
5546 else {
5547 bytes2 = PyMem_Realloc(bytes, final_size);
5548 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005549 if (bytes2 == NULL) {
5550 if (error_pos != NULL) {
5551 *error_pos = (size_t)-1;
5552 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005553 if (raw_malloc) {
5554 PyMem_RawFree(bytes);
5555 }
5556 else {
5557 PyMem_Free(bytes);
5558 }
5559 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005560 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005561 *str = bytes2;
5562 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005563}
5564
5565
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005566/* Primary internal function which creates utf8 encoded bytes objects.
5567
5568 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005569 and allocate exactly as much space needed at the end. Else allocate the
5570 maximum possible needed (4 result bytes per Unicode character), and return
5571 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005572*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005573static PyObject *
5574unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5575 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005577 if (!PyUnicode_Check(unicode)) {
5578 PyErr_BadArgument();
5579 return NULL;
5580 }
5581
5582 if (PyUnicode_READY(unicode) == -1)
5583 return NULL;
5584
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005585 if (PyUnicode_UTF8(unicode))
5586 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5587 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005588
Inada Naoki02a4d572020-02-27 13:48:59 +09005589 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005590 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005591 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5592
5593 _PyBytesWriter writer;
5594 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005595
Benjamin Petersonead6b532011-12-20 17:23:42 -06005596 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005597 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005598 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005599 case PyUnicode_1BYTE_KIND:
5600 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5601 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005602 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5603 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005604 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005605 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5606 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005607 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005608 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5609 break;
Tim Peters602f7402002-04-27 18:03:26 +00005610 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005611
5612 if (end == NULL) {
5613 _PyBytesWriter_Dealloc(&writer);
5614 return NULL;
5615 }
5616 return _PyBytesWriter_Finish(&writer, end);
5617}
5618
5619static int
5620unicode_fill_utf8(PyObject *unicode)
5621{
5622 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5623 assert(!PyUnicode_IS_ASCII(unicode));
5624
5625 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005626 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005627 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5628
5629 _PyBytesWriter writer;
5630 char *end;
5631
5632 switch (kind) {
5633 default:
5634 Py_UNREACHABLE();
5635 case PyUnicode_1BYTE_KIND:
5636 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5637 _Py_ERROR_STRICT, NULL);
5638 break;
5639 case PyUnicode_2BYTE_KIND:
5640 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5641 _Py_ERROR_STRICT, NULL);
5642 break;
5643 case PyUnicode_4BYTE_KIND:
5644 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5645 _Py_ERROR_STRICT, NULL);
5646 break;
5647 }
5648 if (end == NULL) {
5649 _PyBytesWriter_Dealloc(&writer);
5650 return -1;
5651 }
5652
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005653 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005654 PyBytes_AS_STRING(writer.buffer);
5655 Py_ssize_t len = end - start;
5656
Victor Stinner32bd68c2020-12-01 10:37:39 +01005657 char *cache = PyObject_Malloc(len + 1);
Inada Naoki02a4d572020-02-27 13:48:59 +09005658 if (cache == NULL) {
5659 _PyBytesWriter_Dealloc(&writer);
5660 PyErr_NoMemory();
5661 return -1;
5662 }
5663 _PyUnicode_UTF8(unicode) = cache;
5664 _PyUnicode_UTF8_LENGTH(unicode) = len;
5665 memcpy(cache, start, len);
5666 cache[len] = '\0';
5667 _PyBytesWriter_Dealloc(&writer);
5668 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669}
5670
Alexander Belopolsky40018472011-02-26 01:02:56 +00005671PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005672_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5673{
5674 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5675}
5676
5677
5678PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005679PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5680 Py_ssize_t size,
5681 const char *errors)
5682{
5683 PyObject *v, *unicode;
5684
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005685 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005686 if (unicode == NULL)
5687 return NULL;
5688 v = _PyUnicode_AsUTF8String(unicode, errors);
5689 Py_DECREF(unicode);
5690 return v;
5691}
5692
5693PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005694PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005696 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697}
5698
Walter Dörwald41980ca2007-08-16 21:55:45 +00005699/* --- UTF-32 Codec ------------------------------------------------------- */
5700
5701PyObject *
5702PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005703 Py_ssize_t size,
5704 const char *errors,
5705 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005706{
5707 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5708}
5709
/* Decode `size` bytes of UTF-32 data starting at `s` into a str object.

   errors:    name of the error handler; it is forwarded to
              unicode_decode_call_errorhandler_writer() on invalid input.
   byteorder: if not NULL, *byteorder supplies the initial byte order
              (negative: little endian, positive: big endian, 0: native
              order, possibly overridden by a leading BOM).  When the
              initial order is 0 and at least 4 bytes are available,
              *byteorder is updated with the order selected after the BOM
              check.
   consumed:  if not NULL, trailing bytes that do not form a complete
              4-byte unit are not reported as an error; decoding stops and
              *consumed receives the number of bytes processed.

   Return the decoded string, or NULL on error. */
PyObject *
PyUnicode_DecodeUTF32Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int le, bo = 0;       /* assume native ordering by default */
    const char *encoding;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* q is the read cursor, e points one past the last input byte */
    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 4) {
        /* The first four bytes are always assembled as little endian, so a
           big-endian BOM shows up byte-swapped as 0xFFFE0000. */
        Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
        if (bom == 0x0000FEFF) {
            bo = -1;
            q += 4;
        }
        else if (bom == 0xFFFE0000) {
            bo = 1;
            q += 4;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing left after the optional BOM: the result is the empty string */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* With bo == 0 (no BOM found) the platform's native order is used */
#ifdef WORDS_BIGENDIAN
    le = bo < 0;
#else
    le = bo <= 0;
#endif
    encoding = le ? "utf-32-le" : "utf-32-be";

    /* At most one character per 4 input bytes (rounded up) */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 3) / 4;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);

        if (e - q >= 4) {
            /* Fast path: write characters straight into the writer's
               buffer for as long as they fit its current maximum character
               and are not surrogates.  The loop is left with the offending
               code unit in ch and q still pointing at it. */
            enum PyUnicode_Kind kind = writer.kind;
            void *data = writer.data;
            const unsigned char *last = e - 4;
            Py_ssize_t pos = writer.pos;
            if (le) {
                do {
                    ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            else {
                do {
                    ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
                    if (ch > maxch)
                        break;
                    if (kind != PyUnicode_1BYTE_KIND &&
                        Py_UNICODE_IS_SURROGATE(ch))
                        break;
                    PyUnicode_WRITE(kind, data, pos++, ch);
                    q += 4;
                } while (q <= last);
            }
            writer.pos = pos;
        }

        /* Work out why the fast path stopped (or never started) */
        if (Py_UNICODE_IS_SURROGATE(ch)) {
            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }
        else if (ch <= maxch) {
            /* Every complete unit was written; only 0 to 3 bytes remain */
            if (q == e || consumed)
                break;
            /* remaining bytes at the end? (size should be divisible by 4) */
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
        }
        else {
            if (ch < 0x110000) {
                /* Valid code point that exceeds the writer's current
                   maximum character: write it through the writer API and
                   go back to the fast path. */
                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                    goto onError;
                q += 4;
                continue;
            }
            errmsg = "code point not in range(0x110000)";
            startinpos = ((const char *)q) - starts;
            endinpos = startinpos + 4;
        }

        /* The remaining input chars are ignored if the callback
           chooses to skip the input */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                encoding, errmsg,
                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
                &writer))
            goto onError;
    }

    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
5854
/* Encode the str object `str` to a UTF-32 bytes object.

   errors:    name of the error handler, forwarded to
              unicode_encode_call_errorhandler() when the kind-specific
              encoder stops before the end of the string.
   byteorder: -1 selects little endian, 1 selects big endian; 0 selects
              the platform's native order and prepends a BOM (U+FEFF).

   Return a new bytes object, or NULL on error. */
PyObject *
_PyUnicode_EncodeUTF32(PyObject *str,
                       const char *errors,
                       int byteorder)
{
    enum PyUnicode_Kind kind;
    const void *data;
    Py_ssize_t len;
    PyObject *v;
    uint32_t *out;
    /* True when the requested byte order matches the platform's own */
#if PY_LITTLE_ENDIAN
    int native_ordering = byteorder <= 0;
#else
    int native_ordering = byteorder >= 0;
#endif
    const char *encoding;
    Py_ssize_t nsize, pos;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;

    if (!PyUnicode_Check(str)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (PyUnicode_READY(str) == -1)
        return NULL;
    kind = PyUnicode_KIND(str);
    data = PyUnicode_DATA(str);
    len = PyUnicode_GET_LENGTH(str);

    /* 4 bytes per character, plus one extra unit for the BOM when
       byteorder == 0; reject lengths whose byte size would overflow. */
    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
        return PyErr_NoMemory();
    nsize = len + (byteorder == 0);
    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
    if (v == NULL)
        return NULL;

    /* output buffer is 4-bytes aligned */
    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
    out = (uint32_t *)PyBytes_AS_STRING(v);
    if (byteorder == 0)
        *out++ = 0xFEFF;
    if (len == 0)
        goto done;

    /* Encoding name used when reporting errors */
    if (byteorder == -1)
        encoding = "utf-32-le";
    else if (byteorder == 1)
        encoding = "utf-32-be";
    else
        encoding = "utf-32";

    /* 1-byte strings are encoded in a single call with no error handling */
    if (kind == PyUnicode_1BYTE_KIND) {
        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
        goto done;
    }

    pos = 0;
    while (pos < len) {
        Py_ssize_t repsize, moreunits;

        /* The helper's return value advances pos; if it stops short of len
           the character at pos is handed to the error handler
           (presumably a surrogate, given the error message below). */
        if (kind == PyUnicode_2BYTE_KIND) {
            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        else {
            assert(kind == PyUnicode_4BYTE_KIND);
            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
                                        &out, native_ordering);
        }
        if (pos == len)
            break;

        /* The handler returns a replacement object and updates pos to the
           position where encoding resumes. */
        rep = unicode_encode_call_errorhandler(
                errors, &errorHandler,
                encoding, "surrogates not allowed",
                str, &exc, pos, pos + 1, &pos);
        if (!rep)
            goto error;

        if (PyBytes_Check(rep)) {
            /* A bytes replacement is copied verbatim, so its size must be
               a whole number of 4-byte units. */
            repsize = PyBytes_GET_SIZE(rep);
            if (repsize & 3) {
                raise_encode_exception(&exc, encoding,
                                       str, pos - 1, pos,
                                       "surrogates not allowed");
                goto error;
            }
            moreunits = repsize / 4;
        }
        else {
            /* A str replacement must be pure ASCII; each of its characters
               becomes one 4-byte unit. */
            assert(PyUnicode_Check(rep));
            if (PyUnicode_READY(rep) < 0)
                goto error;
            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
            if (!PyUnicode_IS_ASCII(rep)) {
                raise_encode_exception(&exc, encoding,
                                       str, pos - 1, pos,
                                       "surrogates not allowed");
                goto error;
            }
        }

        /* four bytes are reserved for each surrogate */
        if (moreunits > 1) {
            /* Resizing may move the buffer: save the write offset first
               and recompute the out pointer afterwards. */
            Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
            if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
                /* integer overflow */
                PyErr_NoMemory();
                goto error;
            }
            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
                goto error;
            out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
        }

        if (PyBytes_Check(rep)) {
            memcpy(out, PyBytes_AS_STRING(rep), repsize);
            out += moreunits;
        } else /* rep is unicode */ {
            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
                                 &out, native_ordering);
        }

        Py_CLEAR(rep);
    }

    /* Cut back to size actually needed. This is necessary for, for example,
       encoding of a string containing isolated surrogates and the 'ignore'
       handler is used. */
    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
    if (nsize != PyBytes_GET_SIZE(v))
      _PyBytes_Resize(&v, nsize);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
  done:
    /* Paths that jump here directly never called the error handler, so
       errorHandler and exc are still NULL. */
    return v;
  error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
6001
Alexander Belopolsky40018472011-02-26 01:02:56 +00006002PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006003PyUnicode_EncodeUTF32(const Py_UNICODE *s,
6004 Py_ssize_t size,
6005 const char *errors,
6006 int byteorder)
6007{
6008 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006009 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006010 if (tmp == NULL)
6011 return NULL;
6012 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
6013 Py_DECREF(tmp);
6014 return result;
6015}
6016
6017PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006018PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00006019{
Victor Stinnerb960b342011-11-20 19:12:52 +01006020 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00006021}
6022
Guido van Rossumd57fd912000-03-10 22:53:23 +00006023/* --- UTF-16 Codec ------------------------------------------------------- */
6024
Tim Peters772747b2001-08-09 22:21:55 +00006025PyObject *
6026PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006027 Py_ssize_t size,
6028 const char *errors,
6029 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030{
Walter Dörwald69652032004-09-07 20:24:22 +00006031 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6032}
6033
/* Decode `size` bytes of UTF-16 data starting at `s` into a str object.

   errors:    name of the error handler; it is forwarded to
              unicode_decode_call_errorhandler_writer() on invalid input.
   byteorder: if not NULL, *byteorder supplies the initial byte order
              (negative: little endian, positive: big endian, 0: native
              order, possibly overridden by a leading BOM).  When the
              initial order is 0 and at least 2 bytes are available,
              *byteorder is updated with the order selected after the BOM
              check.
   consumed:  if not NULL, incomplete data at the end of the input is not
              reported as an error; decoding stops and *consumed receives
              the number of bytes processed.

   Return the decoded string, or NULL on error. */
PyObject *
PyUnicode_DecodeUTF16Stateful(const char *s,
                              Py_ssize_t size,
                              const char *errors,
                              int *byteorder,
                              Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    _PyUnicodeWriter writer;
    const unsigned char *q, *e;
    int bo = 0;       /* assume native ordering by default */
    int native_ordering;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    const char *encoding;

    /* q is the read cursor, e points one past the last input byte */
    q = (const unsigned char *)s;
    e = q + size;

    if (byteorder)
        bo = *byteorder;

    /* Check for BOM marks (U+FEFF) in the input and adjust current
       byte order setting accordingly. In native mode, the leading BOM
       mark is skipped, in all other modes, it is copied to the output
       stream as-is (giving a ZWNBSP character). */
    if (bo == 0 && size >= 2) {
        /* The first two bytes are always assembled as little endian, so a
           big-endian BOM shows up byte-swapped as 0xFFFE. */
        const Py_UCS4 bom = (q[1] << 8) | q[0];
        if (bom == 0xFEFF) {
            q += 2;
            bo = -1;
        }
        else if (bom == 0xFFFE) {
            q += 2;
            bo = 1;
        }
        if (byteorder)
            *byteorder = bo;
    }

    /* Nothing left after the optional BOM: the result is the empty string */
    if (q == e) {
        if (consumed)
            *consumed = size;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* With bo == 0 (no BOM found) the platform's native order is used */
#if PY_LITTLE_ENDIAN
    native_ordering = bo <= 0;
    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
#else
    native_ordering = bo >= 0;
    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
#endif

    /* Note: size will always be longer than the resulting Unicode
       character count normally.  Error handler will take care of
       resizing when needed. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = (e - q + 1) / 2;
    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
        goto onError;

    while (1) {
        Py_UCS4 ch = 0;
        if (e - q >= 2) {
            /* Run the decoder specialised for the writer's current
               character width.  It advances q and writer.pos and returns
               a status that the switch below interprets: 0..3 are
               stop/error codes, any other value is a character to write
               through the writer API. */
            int kind = writer.kind;
            if (kind == PyUnicode_1BYTE_KIND) {
                if (PyUnicode_IS_ASCII(writer.buffer))
                    ch = asciilib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
                else
                    ch = ucs1lib_utf16_decode(&q, e,
                            (Py_UCS1*)writer.data, &writer.pos,
                            native_ordering);
            } else if (kind == PyUnicode_2BYTE_KIND) {
                ch = ucs2lib_utf16_decode(&q, e,
                        (Py_UCS2*)writer.data, &writer.pos,
                        native_ordering);
            } else {
                assert(kind == PyUnicode_4BYTE_KIND);
                ch = ucs4lib_utf16_decode(&q, e,
                        (Py_UCS4*)writer.data, &writer.pos,
                        native_ordering);
            }
        }

        switch (ch)
        {
        case 0:
            /* remaining byte at the end? (size should be even) */
            if (q == e || consumed)
                goto End;
            errmsg = "truncated data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
            /* The remaining input chars are ignored if the callback
               chooses to skip the input */
        case 1:
            /* Step q back by one code unit so that the reported range (and
               *consumed in stateful mode) starts at that unit. */
            q -= 2;
            if (consumed)
                goto End;
            errmsg = "unexpected end of data";
            startinpos = ((const char *)q) - starts;
            endinpos = ((const char *)e) - starts;
            break;
        case 2:
            /* The error range is the code unit just before q */
            errmsg = "illegal encoding";
            startinpos = ((const char *)q) - 2 - starts;
            endinpos = startinpos + 2;
            break;
        case 3:
            /* The error range is the first of the two code units before q */
            errmsg = "illegal UTF-16 surrogate";
            startinpos = ((const char *)q) - 4 - starts;
            endinpos = startinpos + 2;
            break;
        default:
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
            continue;
        }

        if (unicode_decode_call_errorhandler_writer(
                errors,
                &errorHandler,
                encoding, errmsg,
                &starts,
                (const char **)&e,
                &startinpos,
                &endinpos,
                &exc,
                (const char **)&q,
                &writer))
            goto onError;
    }

End:
    if (consumed)
        *consumed = (const char *)q-starts;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    _PyUnicodeWriter_Dealloc(&writer);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6188
Tim Peters772747b2001-08-09 22:21:55 +00006189PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006190_PyUnicode_EncodeUTF16(PyObject *str,
6191 const char *errors,
6192 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006194 enum PyUnicode_Kind kind;
6195 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006196 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006197 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006198 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006199 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006200#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006201 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006202#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006203 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006204#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006205 const char *encoding;
6206 Py_ssize_t nsize, pos;
6207 PyObject *errorHandler = NULL;
6208 PyObject *exc = NULL;
6209 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006210
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006211 if (!PyUnicode_Check(str)) {
6212 PyErr_BadArgument();
6213 return NULL;
6214 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006215 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006216 return NULL;
6217 kind = PyUnicode_KIND(str);
6218 data = PyUnicode_DATA(str);
6219 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006220
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006221 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006222 if (kind == PyUnicode_4BYTE_KIND) {
6223 const Py_UCS4 *in = (const Py_UCS4 *)data;
6224 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006225 while (in < end) {
6226 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006227 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006228 }
6229 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006230 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006231 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006232 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006233 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006234 nsize = len + pairs + (byteorder == 0);
6235 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006236 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006238 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006240 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006241 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006242 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006243 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006244 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006245 }
6246 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006247 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006248 }
Tim Peters772747b2001-08-09 22:21:55 +00006249
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006250 if (kind == PyUnicode_1BYTE_KIND) {
6251 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6252 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006253 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006254
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006255 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006256 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006257 }
6258 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006259 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006260 }
6261 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006262 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006263 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006264
6265 pos = 0;
6266 while (pos < len) {
6267 Py_ssize_t repsize, moreunits;
6268
6269 if (kind == PyUnicode_2BYTE_KIND) {
6270 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6271 &out, native_ordering);
6272 }
6273 else {
6274 assert(kind == PyUnicode_4BYTE_KIND);
6275 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6276 &out, native_ordering);
6277 }
6278 if (pos == len)
6279 break;
6280
6281 rep = unicode_encode_call_errorhandler(
6282 errors, &errorHandler,
6283 encoding, "surrogates not allowed",
6284 str, &exc, pos, pos + 1, &pos);
6285 if (!rep)
6286 goto error;
6287
6288 if (PyBytes_Check(rep)) {
6289 repsize = PyBytes_GET_SIZE(rep);
6290 if (repsize & 1) {
6291 raise_encode_exception(&exc, encoding,
6292 str, pos - 1, pos,
6293 "surrogates not allowed");
6294 goto error;
6295 }
6296 moreunits = repsize / 2;
6297 }
6298 else {
6299 assert(PyUnicode_Check(rep));
6300 if (PyUnicode_READY(rep) < 0)
6301 goto error;
6302 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6303 if (!PyUnicode_IS_ASCII(rep)) {
6304 raise_encode_exception(&exc, encoding,
6305 str, pos - 1, pos,
6306 "surrogates not allowed");
6307 goto error;
6308 }
6309 }
6310
6311 /* two bytes are reserved for each surrogate */
6312 if (moreunits > 1) {
6313 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006314 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006315 /* integer overflow */
6316 PyErr_NoMemory();
6317 goto error;
6318 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006319 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006320 goto error;
6321 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6322 }
6323
6324 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006325 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006326 out += moreunits;
6327 } else /* rep is unicode */ {
6328 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6329 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6330 &out, native_ordering);
6331 }
6332
6333 Py_CLEAR(rep);
6334 }
6335
6336 /* Cut back to size actually needed. This is necessary for, for example,
6337 encoding of a string containing isolated surrogates and the 'ignore' handler
6338 is used. */
6339 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6340 if (nsize != PyBytes_GET_SIZE(v))
6341 _PyBytes_Resize(&v, nsize);
6342 Py_XDECREF(errorHandler);
6343 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006344 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006345 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006346 error:
6347 Py_XDECREF(rep);
6348 Py_XDECREF(errorHandler);
6349 Py_XDECREF(exc);
6350 Py_XDECREF(v);
6351 return NULL;
6352#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006353}
6354
Alexander Belopolsky40018472011-02-26 01:02:56 +00006355PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006356PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6357 Py_ssize_t size,
6358 const char *errors,
6359 int byteorder)
6360{
6361 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006362 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006363 if (tmp == NULL)
6364 return NULL;
6365 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6366 Py_DECREF(tmp);
6367 return result;
6368}
6369
6370PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006371PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006372{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006373 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006374}
6375
6376/* --- Unicode Escape Codec ----------------------------------------------- */
6377
Victor Stinner47e1afd2020-10-26 16:43:47 +01006378static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006379
Alexander Belopolsky40018472011-02-26 01:02:56 +00006380PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006381_PyUnicode_DecodeUnicodeEscape(const char *s,
6382 Py_ssize_t size,
6383 const char *errors,
6384 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006386 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006387 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006388 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006389 PyObject *errorHandler = NULL;
6390 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006391
Eric V. Smith42454af2016-10-31 09:22:08 -04006392 // so we can remember if we've seen an invalid escape char or not
6393 *first_invalid_escape = NULL;
6394
Victor Stinner62ec3312016-09-06 17:04:34 -07006395 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006396 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006397 }
6398 /* Escaped strings will always be longer than the resulting
6399 Unicode string, so we start with size here and then reduce the
6400 length after conversion to the true value.
6401 (but if the error callback returns a long replacement string
6402 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006403 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006404 writer.min_length = size;
6405 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6406 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006407 }
6408
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409 end = s + size;
6410 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006411 unsigned char c = (unsigned char) *s++;
6412 Py_UCS4 ch;
6413 int count;
6414 Py_ssize_t startinpos;
6415 Py_ssize_t endinpos;
6416 const char *message;
6417
6418#define WRITE_ASCII_CHAR(ch) \
6419 do { \
6420 assert(ch <= 127); \
6421 assert(writer.pos < writer.size); \
6422 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6423 } while(0)
6424
6425#define WRITE_CHAR(ch) \
6426 do { \
6427 if (ch <= writer.maxchar) { \
6428 assert(writer.pos < writer.size); \
6429 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6430 } \
6431 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6432 goto onError; \
6433 } \
6434 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435
6436 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006437 if (c != '\\') {
6438 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439 continue;
6440 }
6441
Victor Stinner62ec3312016-09-06 17:04:34 -07006442 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006444 if (s >= end) {
6445 message = "\\ at end of string";
6446 goto error;
6447 }
6448 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006449
Victor Stinner62ec3312016-09-06 17:04:34 -07006450 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006451 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006452
Benjamin Peterson29060642009-01-31 22:14:21 +00006453 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006454 case '\n': continue;
6455 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6456 case '\'': WRITE_ASCII_CHAR('\''); continue;
6457 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6458 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006459 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006460 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6461 case 't': WRITE_ASCII_CHAR('\t'); continue;
6462 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6463 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006464 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006465 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006466 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006467 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468
Benjamin Peterson29060642009-01-31 22:14:21 +00006469 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 case '0': case '1': case '2': case '3':
6471 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006472 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006473 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006474 ch = (ch<<3) + *s++ - '0';
6475 if (s < end && '0' <= *s && *s <= '7') {
6476 ch = (ch<<3) + *s++ - '0';
6477 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006479 WRITE_CHAR(ch);
6480 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 /* hex escapes */
6483 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006485 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006486 message = "truncated \\xXX escape";
6487 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006490 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006491 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006492 message = "truncated \\uXXXX escape";
6493 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494
Benjamin Peterson29060642009-01-31 22:14:21 +00006495 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006496 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006497 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006498 message = "truncated \\UXXXXXXXX escape";
6499 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006500 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006501 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006502 ch <<= 4;
6503 if (c >= '0' && c <= '9') {
6504 ch += c - '0';
6505 }
6506 else if (c >= 'a' && c <= 'f') {
6507 ch += c - ('a' - 10);
6508 }
6509 else if (c >= 'A' && c <= 'F') {
6510 ch += c - ('A' - 10);
6511 }
6512 else {
6513 break;
6514 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006515 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006516 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006517 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006518 }
6519
6520 /* when we get here, ch is a 32-bit unicode character */
6521 if (ch > MAX_UNICODE) {
6522 message = "illegal Unicode character";
6523 goto error;
6524 }
6525
6526 WRITE_CHAR(ch);
6527 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006528
Benjamin Peterson29060642009-01-31 22:14:21 +00006529 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006530 case 'N':
Victor Stinner47e1afd2020-10-26 16:43:47 +01006531 if (ucnhash_capi == NULL) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006532 /* load the unicode data module */
Victor Stinner47e1afd2020-10-26 16:43:47 +01006533 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006534 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner47e1afd2020-10-26 16:43:47 +01006535 if (ucnhash_capi == NULL) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006536 PyErr_SetString(
6537 PyExc_UnicodeError,
6538 "\\N escapes not supported (can't load unicodedata module)"
6539 );
6540 goto onError;
6541 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006542 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006543
6544 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006545 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006546 const char *start = ++s;
6547 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006548 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006549 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006550 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006551 namelen = s - start;
6552 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006553 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006554 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006555 ch = 0xffffffff; /* in case 'getcode' messes up */
6556 if (namelen <= INT_MAX &&
Victor Stinner920cb642020-10-26 19:19:36 +01006557 ucnhash_capi->getcode(start, (int)namelen,
Victor Stinner62ec3312016-09-06 17:04:34 -07006558 &ch, 0)) {
6559 assert(ch <= MAX_UNICODE);
6560 WRITE_CHAR(ch);
6561 continue;
6562 }
6563 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006564 }
6565 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006566 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006567
6568 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006569 if (*first_invalid_escape == NULL) {
6570 *first_invalid_escape = s-1; /* Back up one char, since we've
6571 already incremented s. */
6572 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006573 WRITE_ASCII_CHAR('\\');
6574 WRITE_CHAR(c);
6575 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006576 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006577
6578 error:
6579 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006580 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006581 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006582 errors, &errorHandler,
6583 "unicodeescape", message,
6584 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006585 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006586 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006587 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006588 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006589
6590#undef WRITE_ASCII_CHAR
6591#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006592 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006593
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006594 Py_XDECREF(errorHandler);
6595 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006596 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006597
Benjamin Peterson29060642009-01-31 22:14:21 +00006598 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006599 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006600 Py_XDECREF(errorHandler);
6601 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 return NULL;
6603}
6604
Eric V. Smith42454af2016-10-31 09:22:08 -04006605PyObject *
6606PyUnicode_DecodeUnicodeEscape(const char *s,
6607 Py_ssize_t size,
6608 const char *errors)
6609{
6610 const char *first_invalid_escape;
6611 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6612 &first_invalid_escape);
6613 if (result == NULL)
6614 return NULL;
6615 if (first_invalid_escape != NULL) {
6616 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6617 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006618 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006619 Py_DECREF(result);
6620 return NULL;
6621 }
6622 }
6623 return result;
6624}
6625
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006626/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627
Alexander Belopolsky40018472011-02-26 01:02:56 +00006628PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006629PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006631 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006632 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006634 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006635 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006636 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637
Ezio Melottie7f90372012-10-05 03:33:31 +03006638 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006639 escape.
6640
Ezio Melottie7f90372012-10-05 03:33:31 +03006641 For UCS1 strings it's '\xxx', 4 bytes per source character.
6642 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6643 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006644 */
6645
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006646 if (!PyUnicode_Check(unicode)) {
6647 PyErr_BadArgument();
6648 return NULL;
6649 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006650 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006651 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006652 }
Victor Stinner358af132015-10-12 22:36:57 +02006653
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006654 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006655 if (len == 0) {
6656 return PyBytes_FromStringAndSize(NULL, 0);
6657 }
6658
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006659 kind = PyUnicode_KIND(unicode);
6660 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006661 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6662 bytes, and 1 byte characters 4. */
6663 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006664 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006665 return PyErr_NoMemory();
6666 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006667 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006668 if (repr == NULL) {
6669 return NULL;
6670 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006671
Victor Stinner62ec3312016-09-06 17:04:34 -07006672 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006673 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006674 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006675
Victor Stinner62ec3312016-09-06 17:04:34 -07006676 /* U+0000-U+00ff range */
6677 if (ch < 0x100) {
6678 if (ch >= ' ' && ch < 127) {
6679 if (ch != '\\') {
6680 /* Copy printable US ASCII as-is */
6681 *p++ = (char) ch;
6682 }
6683 /* Escape backslashes */
6684 else {
6685 *p++ = '\\';
6686 *p++ = '\\';
6687 }
6688 }
Victor Stinner358af132015-10-12 22:36:57 +02006689
Victor Stinner62ec3312016-09-06 17:04:34 -07006690 /* Map special whitespace to '\t', \n', '\r' */
6691 else if (ch == '\t') {
6692 *p++ = '\\';
6693 *p++ = 't';
6694 }
6695 else if (ch == '\n') {
6696 *p++ = '\\';
6697 *p++ = 'n';
6698 }
6699 else if (ch == '\r') {
6700 *p++ = '\\';
6701 *p++ = 'r';
6702 }
6703
6704 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6705 else {
6706 *p++ = '\\';
6707 *p++ = 'x';
6708 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6709 *p++ = Py_hexdigits[ch & 0x000F];
6710 }
Tim Petersced69f82003-09-16 20:30:58 +00006711 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006712 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006713 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006714 *p++ = '\\';
6715 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006716 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6717 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6718 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6719 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006721 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6722 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006723
Victor Stinner62ec3312016-09-06 17:04:34 -07006724 /* Make sure that the first two digits are zero */
6725 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006726 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006727 *p++ = 'U';
6728 *p++ = '0';
6729 *p++ = '0';
6730 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6731 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6732 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6733 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6734 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6735 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006738
Victor Stinner62ec3312016-09-06 17:04:34 -07006739 assert(p - PyBytes_AS_STRING(repr) > 0);
6740 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6741 return NULL;
6742 }
6743 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744}
6745
Alexander Belopolsky40018472011-02-26 01:02:56 +00006746PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006747PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6748 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006750 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006751 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006752 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006754 }
6755
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006756 result = PyUnicode_AsUnicodeEscapeString(tmp);
6757 Py_DECREF(tmp);
6758 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759}
6760
6761/* --- Raw Unicode Escape Codec ------------------------------------------- */
6762
Alexander Belopolsky40018472011-02-26 01:02:56 +00006763PyObject *
6764PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006765 Py_ssize_t size,
6766 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006768 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006769 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006771 PyObject *errorHandler = NULL;
6772 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006773
Victor Stinner62ec3312016-09-06 17:04:34 -07006774 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006775 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006776 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006777
Guido van Rossumd57fd912000-03-10 22:53:23 +00006778 /* Escaped strings will always be longer than the resulting
6779 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006780 length after conversion to the true value. (But decoding error
6781 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006782 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006783 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006784 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6785 goto onError;
6786 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006787
Guido van Rossumd57fd912000-03-10 22:53:23 +00006788 end = s + size;
6789 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006790 unsigned char c = (unsigned char) *s++;
6791 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006792 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006793 Py_ssize_t startinpos;
6794 Py_ssize_t endinpos;
6795 const char *message;
6796
6797#define WRITE_CHAR(ch) \
6798 do { \
6799 if (ch <= writer.maxchar) { \
6800 assert(writer.pos < writer.size); \
6801 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6802 } \
6803 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6804 goto onError; \
6805 } \
6806 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006807
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006809 if (c != '\\' || s >= end) {
6810 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006812 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006813
Victor Stinner62ec3312016-09-06 17:04:34 -07006814 c = (unsigned char) *s++;
6815 if (c == 'u') {
6816 count = 4;
6817 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006819 else if (c == 'U') {
6820 count = 8;
6821 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006822 }
6823 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006824 assert(writer.pos < writer.size);
6825 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6826 WRITE_CHAR(c);
6827 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006828 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006829 startinpos = s - starts - 2;
6830
6831 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6832 for (ch = 0; count && s < end; ++s, --count) {
6833 c = (unsigned char)*s;
6834 ch <<= 4;
6835 if (c >= '0' && c <= '9') {
6836 ch += c - '0';
6837 }
6838 else if (c >= 'a' && c <= 'f') {
6839 ch += c - ('a' - 10);
6840 }
6841 else if (c >= 'A' && c <= 'F') {
6842 ch += c - ('A' - 10);
6843 }
6844 else {
6845 break;
6846 }
6847 }
6848 if (!count) {
6849 if (ch <= MAX_UNICODE) {
6850 WRITE_CHAR(ch);
6851 continue;
6852 }
6853 message = "\\Uxxxxxxxx out of range";
6854 }
6855
6856 endinpos = s-starts;
6857 writer.min_length = end - s + writer.pos;
6858 if (unicode_decode_call_errorhandler_writer(
6859 errors, &errorHandler,
6860 "rawunicodeescape", message,
6861 &starts, &end, &startinpos, &endinpos, &exc, &s,
6862 &writer)) {
6863 goto onError;
6864 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006865 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006866
6867#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006868 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006869 Py_XDECREF(errorHandler);
6870 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006871 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006872
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006874 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006875 Py_XDECREF(errorHandler);
6876 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006878
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879}
6880
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006881
Alexander Belopolsky40018472011-02-26 01:02:56 +00006882PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006883PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006884{
Victor Stinner62ec3312016-09-06 17:04:34 -07006885 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006886 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006887 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006888 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006889 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006890 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006891
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006892 if (!PyUnicode_Check(unicode)) {
6893 PyErr_BadArgument();
6894 return NULL;
6895 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006896 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006897 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006898 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006899 kind = PyUnicode_KIND(unicode);
6900 data = PyUnicode_DATA(unicode);
6901 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006902 if (kind == PyUnicode_1BYTE_KIND) {
6903 return PyBytes_FromStringAndSize(data, len);
6904 }
Victor Stinner0e368262011-11-10 20:12:49 +01006905
Victor Stinner62ec3312016-09-06 17:04:34 -07006906 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6907 bytes, and 1 byte characters 4. */
6908 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006909
Victor Stinner62ec3312016-09-06 17:04:34 -07006910 if (len > PY_SSIZE_T_MAX / expandsize) {
6911 return PyErr_NoMemory();
6912 }
6913 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6914 if (repr == NULL) {
6915 return NULL;
6916 }
6917 if (len == 0) {
6918 return repr;
6919 }
6920
6921 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006922 for (pos = 0; pos < len; pos++) {
6923 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006924
Victor Stinner62ec3312016-09-06 17:04:34 -07006925 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6926 if (ch < 0x100) {
6927 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006928 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006929 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006930 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006931 *p++ = '\\';
6932 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006933 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6934 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6935 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6936 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006937 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006938 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6939 else {
6940 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6941 *p++ = '\\';
6942 *p++ = 'U';
6943 *p++ = '0';
6944 *p++ = '0';
6945 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6946 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6947 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6948 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6949 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6950 *p++ = Py_hexdigits[ch & 15];
6951 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006952 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006953
Victor Stinner62ec3312016-09-06 17:04:34 -07006954 assert(p > PyBytes_AS_STRING(repr));
6955 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6956 return NULL;
6957 }
6958 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006959}
6960
Alexander Belopolsky40018472011-02-26 01:02:56 +00006961PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006962PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6963 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006964{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006965 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006966 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006967 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006968 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006969 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6970 Py_DECREF(tmp);
6971 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972}
6973
6974/* --- Latin-1 Codec ------------------------------------------------------ */
6975
Alexander Belopolsky40018472011-02-26 01:02:56 +00006976PyObject *
6977PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006978 Py_ssize_t size,
6979 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006981 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006982 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006983}
6984
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006985/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006986static void
6987make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006988 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006989 PyObject *unicode,
6990 Py_ssize_t startpos, Py_ssize_t endpos,
6991 const char *reason)
6992{
6993 if (*exceptionObject == NULL) {
6994 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006995 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006996 encoding, unicode, startpos, endpos, reason);
6997 }
6998 else {
6999 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7000 goto onError;
7001 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7002 goto onError;
7003 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7004 goto onError;
7005 return;
7006 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02007007 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01007008 }
7009}
7010
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007011/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007012static void
7013raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007014 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007015 PyObject *unicode,
7016 Py_ssize_t startpos, Py_ssize_t endpos,
7017 const char *reason)
7018{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007019 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007020 encoding, unicode, startpos, endpos, reason);
7021 if (*exceptionObject != NULL)
7022 PyCodec_StrictErrors(*exceptionObject);
7023}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007024
7025/* error handling callback helper:
7026 build arguments, call the callback and check the arguments,
7027 put the result into newpos and return the replacement string, which
7028 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007029static PyObject *
7030unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007031 PyObject **errorHandler,
7032 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007033 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007034 Py_ssize_t startpos, Py_ssize_t endpos,
7035 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007036{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02007037 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007038 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007039 PyObject *restuple;
7040 PyObject *resunicode;
7041
7042 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007043 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007044 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007045 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007046 }
7047
Benjamin Petersonbac79492012-01-14 13:34:47 -05007048 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007049 return NULL;
7050 len = PyUnicode_GET_LENGTH(unicode);
7051
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007052 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007053 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007054 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007056
Petr Viktorinffd97532020-02-11 17:46:57 +01007057 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007058 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007060 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007061 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 Py_DECREF(restuple);
7063 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007064 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007065 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00007066 &resunicode, newpos)) {
7067 Py_DECREF(restuple);
7068 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007069 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007070 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7071 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7072 Py_DECREF(restuple);
7073 return NULL;
7074 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007075 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007076 *newpos = len + *newpos;
7077 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02007078 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007079 Py_DECREF(restuple);
7080 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007081 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007082 Py_INCREF(resunicode);
7083 Py_DECREF(restuple);
7084 return resunicode;
7085}
7086
Alexander Belopolsky40018472011-02-26 01:02:56 +00007087static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007088unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007089 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02007090 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007091{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007092 /* input state */
7093 Py_ssize_t pos=0, size;
7094 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007095 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007096 /* pointer into the output */
7097 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007098 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7099 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02007100 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007101 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02007102 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007103 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007104 /* output object */
7105 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007106
Benjamin Petersonbac79492012-01-14 13:34:47 -05007107 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007108 return NULL;
7109 size = PyUnicode_GET_LENGTH(unicode);
7110 kind = PyUnicode_KIND(unicode);
7111 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007112 /* allocate enough for a simple encoding without
7113 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00007114 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00007115 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007116
7117 _PyBytesWriter_Init(&writer);
7118 str = _PyBytesWriter_Alloc(&writer, size);
7119 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00007120 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007121
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007122 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02007123 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007124
Benjamin Peterson29060642009-01-31 22:14:21 +00007125 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007126 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007127 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007128 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007129 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007130 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007132 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007133 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007134 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007135 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007136 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007137
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007138 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007139 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007140
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007141 /* Only overallocate the buffer if it's not the last write */
7142 writer.overallocate = (collend < size);
7143
Benjamin Peterson29060642009-01-31 22:14:21 +00007144 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007145 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007146 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007147
7148 switch (error_handler) {
7149 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007150 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007151 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007152
7153 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007154 memset(str, '?', collend - collstart);
7155 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007156 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007157 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007158 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007159 break;
Victor Stinner50149202015-09-22 00:26:54 +02007160
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007161 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007162 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007163 writer.min_size -= (collend - collstart);
7164 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007165 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007166 if (str == NULL)
7167 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007168 pos = collend;
7169 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007170
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007171 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007172 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007173 writer.min_size -= (collend - collstart);
7174 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007175 unicode, collstart, collend);
7176 if (str == NULL)
7177 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007178 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007179 break;
Victor Stinner50149202015-09-22 00:26:54 +02007180
Victor Stinnerc3713e92015-09-29 12:32:13 +02007181 case _Py_ERROR_SURROGATEESCAPE:
7182 for (i = collstart; i < collend; ++i) {
7183 ch = PyUnicode_READ(kind, data, i);
7184 if (ch < 0xdc80 || 0xdcff < ch) {
7185 /* Not a UTF-8b surrogate */
7186 break;
7187 }
7188 *str++ = (char)(ch - 0xdc00);
7189 ++pos;
7190 }
7191 if (i >= collend)
7192 break;
7193 collstart = pos;
7194 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007195 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007196
Benjamin Peterson29060642009-01-31 22:14:21 +00007197 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007198 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7199 encoding, reason, unicode, &exc,
7200 collstart, collend, &newpos);
7201 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007202 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007203
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007204 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007205 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007206
Victor Stinner6bd525b2015-10-09 13:10:05 +02007207 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007208 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007209 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007210 PyBytes_AS_STRING(rep),
7211 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007212 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007213 else {
7214 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007215
Victor Stinner6bd525b2015-10-09 13:10:05 +02007216 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007217 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007218
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007219 if (limit == 256 ?
7220 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7221 !PyUnicode_IS_ASCII(rep))
7222 {
7223 /* Not all characters are smaller than limit */
7224 raise_encode_exception(&exc, encoding, unicode,
7225 collstart, collend, reason);
7226 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007227 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007228 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7229 str = _PyBytesWriter_WriteBytes(&writer, str,
7230 PyUnicode_DATA(rep),
7231 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007232 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007233 if (str == NULL)
7234 goto onError;
7235
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007236 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007237 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007238 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007239
7240 /* If overallocation was disabled, ensure that it was the last
7241 write. Otherwise, we missed an optimization */
7242 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007243 }
7244 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007245
Victor Stinner50149202015-09-22 00:26:54 +02007246 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007247 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007248 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007249
7250 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007251 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007252 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007253 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007254 Py_XDECREF(exc);
7255 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007256}
7257
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007258/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007259PyObject *
7260PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007261 Py_ssize_t size,
7262 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007263{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007264 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007265 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007266 if (unicode == NULL)
7267 return NULL;
7268 result = unicode_encode_ucs1(unicode, errors, 256);
7269 Py_DECREF(unicode);
7270 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007271}
7272
Alexander Belopolsky40018472011-02-26 01:02:56 +00007273PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007274_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007275{
7276 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007277 PyErr_BadArgument();
7278 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007280 if (PyUnicode_READY(unicode) == -1)
7281 return NULL;
7282 /* Fast path: if it is a one-byte string, construct
7283 bytes object directly. */
7284 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7285 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7286 PyUnicode_GET_LENGTH(unicode));
7287 /* Non-Latin-1 characters present. Defer to above function to
7288 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007289 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007290}
7291
7292PyObject*
7293PyUnicode_AsLatin1String(PyObject *unicode)
7294{
7295 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007296}
7297
7298/* --- 7-bit ASCII Codec -------------------------------------------------- */
7299
Alexander Belopolsky40018472011-02-26 01:02:56 +00007300PyObject *
7301PyUnicode_DecodeASCII(const char *s,
7302 Py_ssize_t size,
7303 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007304{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007305 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007306 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007307 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007308 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007309 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007310
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007312 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007313
Guido van Rossumd57fd912000-03-10 22:53:23 +00007314 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007315 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007316 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007317 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007318
Inada Naoki770847a2019-06-24 12:30:24 +09007319 // Shortcut for simple case
7320 PyObject *u = PyUnicode_New(size, 127);
7321 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007322 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007323 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007324 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007325 if (outpos == size) {
7326 return u;
7327 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007328
Inada Naoki770847a2019-06-24 12:30:24 +09007329 _PyUnicodeWriter writer;
7330 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007331 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007332
Inada Naoki770847a2019-06-24 12:30:24 +09007333 s += outpos;
7334 int kind = writer.kind;
7335 void *data = writer.data;
7336 Py_ssize_t startinpos, endinpos;
7337
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007338 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007339 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007340 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007341 PyUnicode_WRITE(kind, data, writer.pos, c);
7342 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007343 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007344 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007345 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007346
7347 /* byte outsize range 0x00..0x7f: call the error handler */
7348
7349 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007350 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007351
7352 switch (error_handler)
7353 {
7354 case _Py_ERROR_REPLACE:
7355 case _Py_ERROR_SURROGATEESCAPE:
7356 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007357 but we may switch to UCS2 at the first write */
7358 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7359 goto onError;
7360 kind = writer.kind;
7361 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007362
7363 if (error_handler == _Py_ERROR_REPLACE)
7364 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7365 else
7366 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7367 writer.pos++;
7368 ++s;
7369 break;
7370
7371 case _Py_ERROR_IGNORE:
7372 ++s;
7373 break;
7374
7375 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007376 startinpos = s-starts;
7377 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007378 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007379 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007380 "ascii", "ordinal not in range(128)",
7381 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007382 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007383 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007384 kind = writer.kind;
7385 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007386 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007387 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007388 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007389 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007390 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007391
Benjamin Peterson29060642009-01-31 22:14:21 +00007392 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007393 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007394 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007395 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007396 return NULL;
7397}
7398
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007399/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007400PyObject *
7401PyUnicode_EncodeASCII(const Py_UNICODE *p,
7402 Py_ssize_t size,
7403 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007404{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007405 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007406 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007407 if (unicode == NULL)
7408 return NULL;
7409 result = unicode_encode_ucs1(unicode, errors, 128);
7410 Py_DECREF(unicode);
7411 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007412}
7413
Alexander Belopolsky40018472011-02-26 01:02:56 +00007414PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007415_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007416{
7417 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007418 PyErr_BadArgument();
7419 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007420 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007421 if (PyUnicode_READY(unicode) == -1)
7422 return NULL;
7423 /* Fast path: if it is an ASCII-only string, construct bytes object
7424 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007425 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007426 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7427 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007428 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007429}
7430
7431PyObject *
7432PyUnicode_AsASCIIString(PyObject *unicode)
7433{
7434 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007435}
7436
Steve Dowercc16be82016-09-08 10:35:16 -07007437#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007438
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007439/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007440
/* When Py_ssize_t is wider than int, an input can be longer than the `int`
   lengths used by the code page helpers below, so it is processed in
   several chunks. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#define NEED_RETRY
#endif

/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows. */
#define DECODING_CHUNK_SIZE (INT_MAX/4)

/* Provide the flag value when the included headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
7454
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007455static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007456code_page_name(UINT code_page, PyObject **obj)
7457{
7458 *obj = NULL;
7459 if (code_page == CP_ACP)
7460 return "mbcs";
7461 if (code_page == CP_UTF7)
7462 return "CP_UTF7";
7463 if (code_page == CP_UTF8)
7464 return "CP_UTF8";
7465
7466 *obj = PyBytes_FromFormat("cp%u", code_page);
7467 if (*obj == NULL)
7468 return NULL;
7469 return PyBytes_AS_STRING(*obj);
7470}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007471
Victor Stinner3a50e702011-10-18 21:21:00 +02007472static DWORD
7473decode_code_page_flags(UINT code_page)
7474{
7475 if (code_page == CP_UTF7) {
7476 /* The CP_UTF7 decoder only supports flags=0 */
7477 return 0;
7478 }
7479 else
7480 return MB_ERR_INVALID_CHARS;
7481}
7482
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007483/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007484 * Decode a byte string from a Windows code page into unicode object in strict
7485 * mode.
7486 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007487 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7488 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007489 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007490static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007491decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007492 wchar_t **buf,
7493 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007494 const char *in,
7495 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007496{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007497 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007498 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007500
7501 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007503 while ((outsize = MultiByteToWideChar(code_page, flags,
7504 in, insize, NULL, 0)) <= 0)
7505 {
7506 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7507 goto error;
7508 }
7509 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7510 flags = 0;
7511 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007512
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007513 /* Extend a wchar_t* buffer */
7514 Py_ssize_t n = *bufsize; /* Get the current length */
7515 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7516 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007517 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007518 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007519
7520 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007521 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7522 if (outsize <= 0)
7523 goto error;
7524 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007525
Victor Stinner3a50e702011-10-18 21:21:00 +02007526error:
7527 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7528 return -2;
7529 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007530 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007531}
7532
Victor Stinner3a50e702011-10-18 21:21:00 +02007533/*
7534 * Decode a byte string from a code page into unicode object with an error
7535 * handler.
7536 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007537 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007538 * UnicodeDecodeError exception and returns -1 on error.
7539 */
7540static int
7541decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007542 wchar_t **buf,
7543 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007544 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007545 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007546{
7547 const char *startin = in;
7548 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007549 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007550 /* Ideally, we should get reason from FormatMessage. This is the Windows
7551 2000 English version of the message. */
7552 const char *reason = "No mapping for the Unicode character exists "
7553 "in the target code page.";
7554 /* each step cannot decode more than 1 character, but a character can be
7555 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007556 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007557 int insize;
7558 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007559 PyObject *errorHandler = NULL;
7560 PyObject *exc = NULL;
7561 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007562 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007563 DWORD err;
7564 int ret = -1;
7565
7566 assert(size > 0);
7567
7568 encoding = code_page_name(code_page, &encoding_obj);
7569 if (encoding == NULL)
7570 return -1;
7571
Victor Stinner7d00cc12014-03-17 23:08:06 +01007572 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007573 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7574 UnicodeDecodeError. */
7575 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7576 if (exc != NULL) {
7577 PyCodec_StrictErrors(exc);
7578 Py_CLEAR(exc);
7579 }
7580 goto error;
7581 }
7582
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007583 /* Extend a wchar_t* buffer */
7584 Py_ssize_t n = *bufsize; /* Get the current length */
7585 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7586 PyErr_NoMemory();
7587 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007588 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007589 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7590 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007592 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007593
7594 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007595 while (in < endin)
7596 {
7597 /* Decode a character */
7598 insize = 1;
7599 do
7600 {
7601 outsize = MultiByteToWideChar(code_page, flags,
7602 in, insize,
7603 buffer, Py_ARRAY_LENGTH(buffer));
7604 if (outsize > 0)
7605 break;
7606 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007607 if (err == ERROR_INVALID_FLAGS && flags) {
7608 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7609 flags = 0;
7610 continue;
7611 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007612 if (err != ERROR_NO_UNICODE_TRANSLATION
7613 && err != ERROR_INSUFFICIENT_BUFFER)
7614 {
7615 PyErr_SetFromWindowsErr(0);
7616 goto error;
7617 }
7618 insize++;
7619 }
7620 /* 4=maximum length of a UTF-8 sequence */
7621 while (insize <= 4 && (in + insize) <= endin);
7622
7623 if (outsize <= 0) {
7624 Py_ssize_t startinpos, endinpos, outpos;
7625
Victor Stinner7d00cc12014-03-17 23:08:06 +01007626 /* last character in partial decode? */
7627 if (in + insize >= endin && !final)
7628 break;
7629
Victor Stinner3a50e702011-10-18 21:21:00 +02007630 startinpos = in - startin;
7631 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007632 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007633 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007634 errors, &errorHandler,
7635 encoding, reason,
7636 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007637 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007638 {
7639 goto error;
7640 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007641 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007642 }
7643 else {
7644 in += insize;
7645 memcpy(out, buffer, outsize * sizeof(wchar_t));
7646 out += outsize;
7647 }
7648 }
7649
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007650 /* Shrink the buffer */
7651 assert(out - *buf <= *bufsize);
7652 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007653 /* (in - startin) <= size and size is an int */
7654 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007655
7656error:
7657 Py_XDECREF(encoding_obj);
7658 Py_XDECREF(errorHandler);
7659 Py_XDECREF(exc);
7660 return ret;
7661}
7662
Victor Stinner3a50e702011-10-18 21:21:00 +02007663static PyObject *
7664decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007665 const char *s, Py_ssize_t size,
7666 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007667{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007668 wchar_t *buf = NULL;
7669 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007670 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007671
Victor Stinner3a50e702011-10-18 21:21:00 +02007672 if (code_page < 0) {
7673 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7674 return NULL;
7675 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007676 if (size < 0) {
7677 PyErr_BadInternalCall();
7678 return NULL;
7679 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007680
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007681 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007682 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007683
Victor Stinner76a31a62011-11-04 00:05:13 +01007684 do
7685 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007686#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007687 if (size > DECODING_CHUNK_SIZE) {
7688 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007689 final = 0;
7690 done = 0;
7691 }
7692 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007693#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007694 {
7695 chunk_size = (int)size;
7696 final = (consumed == NULL);
7697 done = 1;
7698 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007699
Victor Stinner76a31a62011-11-04 00:05:13 +01007700 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007701 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007702 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007703 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007704 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007705
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007706 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007707 s, chunk_size);
7708 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007709 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007710 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007711 errors, final);
7712 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007713
7714 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007715 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007716 return NULL;
7717 }
7718
7719 if (consumed)
7720 *consumed += converted;
7721
7722 s += converted;
7723 size -= converted;
7724 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007725
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007726 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7727 PyMem_Free(buf);
7728 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007729}
7730
Alexander Belopolsky40018472011-02-26 01:02:56 +00007731PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007732PyUnicode_DecodeCodePageStateful(int code_page,
7733 const char *s,
7734 Py_ssize_t size,
7735 const char *errors,
7736 Py_ssize_t *consumed)
7737{
7738 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7739}
7740
7741PyObject *
7742PyUnicode_DecodeMBCSStateful(const char *s,
7743 Py_ssize_t size,
7744 const char *errors,
7745 Py_ssize_t *consumed)
7746{
7747 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7748}
7749
7750PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007751PyUnicode_DecodeMBCS(const char *s,
7752 Py_ssize_t size,
7753 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007754{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007755 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7756}
7757
Victor Stinner3a50e702011-10-18 21:21:00 +02007758static DWORD
7759encode_code_page_flags(UINT code_page, const char *errors)
7760{
7761 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007762 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007763 }
7764 else if (code_page == CP_UTF7) {
7765 /* CP_UTF7 only supports flags=0 */
7766 return 0;
7767 }
7768 else {
7769 if (errors != NULL && strcmp(errors, "replace") == 0)
7770 return 0;
7771 else
7772 return WC_NO_BEST_FIT_CHARS;
7773 }
7774}
7775
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007776/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007777 * Encode a Unicode string to a Windows code page into a byte string in strict
7778 * mode.
7779 *
7780 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007781 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007782 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007783static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007784encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007785 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007786 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007787{
Victor Stinner554f3f02010-06-16 23:33:54 +00007788 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007789 BOOL *pusedDefaultChar = &usedDefaultChar;
7790 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007791 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007792 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007793 const DWORD flags = encode_code_page_flags(code_page, NULL);
7794 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007795 /* Create a substring so that we can get the UTF-16 representation
7796 of just the slice under consideration. */
7797 PyObject *substring;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007798 int ret = -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007799
Martin v. Löwis3d325192011-11-04 18:23:06 +01007800 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007801
Victor Stinner3a50e702011-10-18 21:21:00 +02007802 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007803 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007804 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007805 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007806
Victor Stinner2fc507f2011-11-04 20:06:39 +01007807 substring = PyUnicode_Substring(unicode, offset, offset+len);
7808 if (substring == NULL)
7809 return -1;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007810#if USE_UNICODE_WCHAR_CACHE
7811_Py_COMP_DIAG_PUSH
7812_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner2fc507f2011-11-04 20:06:39 +01007813 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7814 if (p == NULL) {
7815 Py_DECREF(substring);
7816 return -1;
7817 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007818_Py_COMP_DIAG_POP
7819#else /* USE_UNICODE_WCHAR_CACHE */
7820 p = PyUnicode_AsWideCharString(substring, &size);
7821 Py_CLEAR(substring);
7822 if (p == NULL) {
7823 return -1;
7824 }
7825#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinner9f067f42013-06-05 00:21:31 +02007826 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007827
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007828 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007829 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007830 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007831 NULL, 0,
7832 NULL, pusedDefaultChar);
7833 if (outsize <= 0)
7834 goto error;
7835 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007836 if (pusedDefaultChar && *pusedDefaultChar) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007837 ret = -2;
7838 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007839 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007840
Victor Stinner3a50e702011-10-18 21:21:00 +02007841 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007842 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007843 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007844 if (*outbytes == NULL) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007845 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007846 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007847 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007848 }
7849 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007850 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007851 const Py_ssize_t n = PyBytes_Size(*outbytes);
7852 if (outsize > PY_SSIZE_T_MAX - n) {
7853 PyErr_NoMemory();
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007854 goto done;
Victor Stinner3a50e702011-10-18 21:21:00 +02007855 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007856 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007857 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007858 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007859 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007860 }
7861
7862 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007863 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007864 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007865 out, outsize,
7866 NULL, pusedDefaultChar);
7867 if (outsize <= 0)
7868 goto error;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007869 if (pusedDefaultChar && *pusedDefaultChar) {
7870 ret = -2;
7871 goto done;
7872 }
7873 ret = 0;
7874
7875done:
7876#if USE_UNICODE_WCHAR_CACHE
7877 Py_DECREF(substring);
7878#else /* USE_UNICODE_WCHAR_CACHE */
7879 PyMem_Free(p);
7880#endif /* USE_UNICODE_WCHAR_CACHE */
7881 return ret;
Victor Stinner554f3f02010-06-16 23:33:54 +00007882
Victor Stinner3a50e702011-10-18 21:21:00 +02007883error:
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007884 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7885 ret = -2;
7886 goto done;
7887 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007888 PyErr_SetFromWindowsErr(0);
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007889 goto done;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007890}
7891
Victor Stinner3a50e702011-10-18 21:21:00 +02007892/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007893 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007894 * error handler.
7895 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007896 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007897 * -1 on other error.
7898 */
7899static int
7900encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007901 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007902 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007903{
Victor Stinner3a50e702011-10-18 21:21:00 +02007904 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007905 Py_ssize_t pos = unicode_offset;
7906 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007907 /* Ideally, we should get reason from FormatMessage. This is the Windows
7908 2000 English version of the message. */
7909 const char *reason = "invalid character";
7910 /* 4=maximum length of a UTF-8 sequence */
7911 char buffer[4];
7912 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7913 Py_ssize_t outsize;
7914 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007915 PyObject *errorHandler = NULL;
7916 PyObject *exc = NULL;
7917 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007918 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007919 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007920 PyObject *rep;
7921 int ret = -1;
7922
7923 assert(insize > 0);
7924
7925 encoding = code_page_name(code_page, &encoding_obj);
7926 if (encoding == NULL)
7927 return -1;
7928
7929 if (errors == NULL || strcmp(errors, "strict") == 0) {
7930 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7931 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007932 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007933 if (exc != NULL) {
7934 PyCodec_StrictErrors(exc);
7935 Py_DECREF(exc);
7936 }
7937 Py_XDECREF(encoding_obj);
7938 return -1;
7939 }
7940
7941 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7942 pusedDefaultChar = &usedDefaultChar;
7943 else
7944 pusedDefaultChar = NULL;
7945
7946 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7947 PyErr_NoMemory();
7948 goto error;
7949 }
7950 outsize = insize * Py_ARRAY_LENGTH(buffer);
7951
7952 if (*outbytes == NULL) {
7953 /* Create string object */
7954 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7955 if (*outbytes == NULL)
7956 goto error;
7957 out = PyBytes_AS_STRING(*outbytes);
7958 }
7959 else {
7960 /* Extend string object */
7961 Py_ssize_t n = PyBytes_Size(*outbytes);
7962 if (n > PY_SSIZE_T_MAX - outsize) {
7963 PyErr_NoMemory();
7964 goto error;
7965 }
7966 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7967 goto error;
7968 out = PyBytes_AS_STRING(*outbytes) + n;
7969 }
7970
7971 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007972 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007973 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007974 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7975 wchar_t chars[2];
7976 int charsize;
7977 if (ch < 0x10000) {
7978 chars[0] = (wchar_t)ch;
7979 charsize = 1;
7980 }
7981 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007982 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7983 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007984 charsize = 2;
7985 }
7986
Victor Stinner3a50e702011-10-18 21:21:00 +02007987 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007988 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007989 buffer, Py_ARRAY_LENGTH(buffer),
7990 NULL, pusedDefaultChar);
7991 if (outsize > 0) {
7992 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7993 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007994 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007995 memcpy(out, buffer, outsize);
7996 out += outsize;
7997 continue;
7998 }
7999 }
8000 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8001 PyErr_SetFromWindowsErr(0);
8002 goto error;
8003 }
8004
Victor Stinner3a50e702011-10-18 21:21:00 +02008005 rep = unicode_encode_call_errorhandler(
8006 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01008007 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008008 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02008009 if (rep == NULL)
8010 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008011 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02008012
8013 if (PyBytes_Check(rep)) {
8014 outsize = PyBytes_GET_SIZE(rep);
8015 if (outsize != 1) {
8016 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8017 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8018 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8019 Py_DECREF(rep);
8020 goto error;
8021 }
8022 out = PyBytes_AS_STRING(*outbytes) + offset;
8023 }
8024 memcpy(out, PyBytes_AS_STRING(rep), outsize);
8025 out += outsize;
8026 }
8027 else {
8028 Py_ssize_t i;
8029 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008030 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02008031
Benjamin Petersonbac79492012-01-14 13:34:47 -05008032 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02008033 Py_DECREF(rep);
8034 goto error;
8035 }
8036
8037 outsize = PyUnicode_GET_LENGTH(rep);
8038 if (outsize != 1) {
8039 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8040 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8041 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8042 Py_DECREF(rep);
8043 goto error;
8044 }
8045 out = PyBytes_AS_STRING(*outbytes) + offset;
8046 }
8047 kind = PyUnicode_KIND(rep);
8048 data = PyUnicode_DATA(rep);
8049 for (i=0; i < outsize; i++) {
8050 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8051 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008052 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008053 encoding, unicode,
8054 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02008055 "unable to encode error handler result to ASCII");
8056 Py_DECREF(rep);
8057 goto error;
8058 }
8059 *out = (unsigned char)ch;
8060 out++;
8061 }
8062 }
8063 Py_DECREF(rep);
8064 }
8065 /* write a NUL byte */
8066 *out = 0;
8067 outsize = out - PyBytes_AS_STRING(*outbytes);
8068 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8069 if (_PyBytes_Resize(outbytes, outsize) < 0)
8070 goto error;
8071 ret = 0;
8072
8073error:
8074 Py_XDECREF(encoding_obj);
8075 Py_XDECREF(errorHandler);
8076 Py_XDECREF(exc);
8077 return ret;
8078}
8079
Victor Stinner3a50e702011-10-18 21:21:00 +02008080static PyObject *
8081encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01008082 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02008083 const char *errors)
8084{
Martin v. Löwis3d325192011-11-04 18:23:06 +01008085 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02008086 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01008087 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01008088 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01008089
Victor Stinner29dacf22015-01-26 16:41:32 +01008090 if (!PyUnicode_Check(unicode)) {
8091 PyErr_BadArgument();
8092 return NULL;
8093 }
8094
Benjamin Petersonbac79492012-01-14 13:34:47 -05008095 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01008096 return NULL;
8097 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00008098
Victor Stinner3a50e702011-10-18 21:21:00 +02008099 if (code_page < 0) {
8100 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8101 return NULL;
8102 }
8103
Martin v. Löwis3d325192011-11-04 18:23:06 +01008104 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01008105 return PyBytes_FromStringAndSize(NULL, 0);
8106
Victor Stinner7581cef2011-11-03 22:32:33 +01008107 offset = 0;
8108 do
8109 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008110#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07008111 if (len > DECODING_CHUNK_SIZE) {
8112 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01008113 done = 0;
8114 }
Victor Stinner7581cef2011-11-03 22:32:33 +01008115 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008116#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01008117 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008118 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008119 done = 1;
8120 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01008121
Victor Stinner76a31a62011-11-04 00:05:13 +01008122 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008123 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01008124 errors);
8125 if (ret == -2)
8126 ret = encode_code_page_errors(code_page, &outbytes,
8127 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008128 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01008129 if (ret < 0) {
8130 Py_XDECREF(outbytes);
8131 return NULL;
8132 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008133
Victor Stinner7581cef2011-11-03 22:32:33 +01008134 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008135 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008136 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008137
Victor Stinner3a50e702011-10-18 21:21:00 +02008138 return outbytes;
8139}
8140
8141PyObject *
8142PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8143 Py_ssize_t size,
8144 const char *errors)
8145{
Victor Stinner7581cef2011-11-03 22:32:33 +01008146 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008147 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008148 if (unicode == NULL)
8149 return NULL;
8150 res = encode_code_page(CP_ACP, unicode, errors);
8151 Py_DECREF(unicode);
8152 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008153}
8154
8155PyObject *
8156PyUnicode_EncodeCodePage(int code_page,
8157 PyObject *unicode,
8158 const char *errors)
8159{
Victor Stinner7581cef2011-11-03 22:32:33 +01008160 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008161}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008162
Alexander Belopolsky40018472011-02-26 01:02:56 +00008163PyObject *
8164PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008165{
Victor Stinner7581cef2011-11-03 22:32:33 +01008166 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008167}
8168
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008169#undef NEED_RETRY
8170
Steve Dowercc16be82016-09-08 10:35:16 -07008171#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008172
Guido van Rossumd57fd912000-03-10 22:53:23 +00008173/* --- Character Mapping Codec -------------------------------------------- */
8174
Victor Stinnerfb161b12013-04-18 01:44:27 +02008175static int
8176charmap_decode_string(const char *s,
8177 Py_ssize_t size,
8178 PyObject *mapping,
8179 const char *errors,
8180 _PyUnicodeWriter *writer)
8181{
8182 const char *starts = s;
8183 const char *e;
8184 Py_ssize_t startinpos, endinpos;
8185 PyObject *errorHandler = NULL, *exc = NULL;
8186 Py_ssize_t maplen;
8187 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008188 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008189 Py_UCS4 x;
8190 unsigned char ch;
8191
8192 if (PyUnicode_READY(mapping) == -1)
8193 return -1;
8194
8195 maplen = PyUnicode_GET_LENGTH(mapping);
8196 mapdata = PyUnicode_DATA(mapping);
8197 mapkind = PyUnicode_KIND(mapping);
8198
8199 e = s + size;
8200
8201 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8202 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8203 * is disabled in encoding aliases, latin1 is preferred because
8204 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008205 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008206 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8207 Py_UCS4 maxchar = writer->maxchar;
8208
8209 assert (writer->kind == PyUnicode_1BYTE_KIND);
8210 while (s < e) {
8211 ch = *s;
8212 x = mapdata_ucs1[ch];
8213 if (x > maxchar) {
8214 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8215 goto onError;
8216 maxchar = writer->maxchar;
8217 outdata = (Py_UCS1 *)writer->data;
8218 }
8219 outdata[writer->pos] = x;
8220 writer->pos++;
8221 ++s;
8222 }
8223 return 0;
8224 }
8225
8226 while (s < e) {
8227 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8228 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008229 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008230 if (outkind == PyUnicode_1BYTE_KIND) {
8231 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8232 Py_UCS4 maxchar = writer->maxchar;
8233 while (s < e) {
8234 ch = *s;
8235 x = mapdata_ucs2[ch];
8236 if (x > maxchar)
8237 goto Error;
8238 outdata[writer->pos] = x;
8239 writer->pos++;
8240 ++s;
8241 }
8242 break;
8243 }
8244 else if (outkind == PyUnicode_2BYTE_KIND) {
8245 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8246 while (s < e) {
8247 ch = *s;
8248 x = mapdata_ucs2[ch];
8249 if (x == 0xFFFE)
8250 goto Error;
8251 outdata[writer->pos] = x;
8252 writer->pos++;
8253 ++s;
8254 }
8255 break;
8256 }
8257 }
8258 ch = *s;
8259
8260 if (ch < maplen)
8261 x = PyUnicode_READ(mapkind, mapdata, ch);
8262 else
8263 x = 0xfffe; /* invalid value */
8264Error:
8265 if (x == 0xfffe)
8266 {
8267 /* undefined mapping */
8268 startinpos = s-starts;
8269 endinpos = startinpos+1;
8270 if (unicode_decode_call_errorhandler_writer(
8271 errors, &errorHandler,
8272 "charmap", "character maps to <undefined>",
8273 &starts, &e, &startinpos, &endinpos, &exc, &s,
8274 writer)) {
8275 goto onError;
8276 }
8277 continue;
8278 }
8279
8280 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8281 goto onError;
8282 ++s;
8283 }
8284 Py_XDECREF(errorHandler);
8285 Py_XDECREF(exc);
8286 return 0;
8287
8288onError:
8289 Py_XDECREF(errorHandler);
8290 Py_XDECREF(exc);
8291 return -1;
8292}
8293
8294static int
8295charmap_decode_mapping(const char *s,
8296 Py_ssize_t size,
8297 PyObject *mapping,
8298 const char *errors,
8299 _PyUnicodeWriter *writer)
8300{
8301 const char *starts = s;
8302 const char *e;
8303 Py_ssize_t startinpos, endinpos;
8304 PyObject *errorHandler = NULL, *exc = NULL;
8305 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008306 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008307
8308 e = s + size;
8309
8310 while (s < e) {
8311 ch = *s;
8312
8313 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8314 key = PyLong_FromLong((long)ch);
8315 if (key == NULL)
8316 goto onError;
8317
8318 item = PyObject_GetItem(mapping, key);
8319 Py_DECREF(key);
8320 if (item == NULL) {
8321 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8322 /* No mapping found means: mapping is undefined. */
8323 PyErr_Clear();
8324 goto Undefined;
8325 } else
8326 goto onError;
8327 }
8328
8329 /* Apply mapping */
8330 if (item == Py_None)
8331 goto Undefined;
8332 if (PyLong_Check(item)) {
8333 long value = PyLong_AS_LONG(item);
8334 if (value == 0xFFFE)
8335 goto Undefined;
8336 if (value < 0 || value > MAX_UNICODE) {
8337 PyErr_Format(PyExc_TypeError,
Max Bernstein36353882020-10-17 13:38:21 -07008338 "character mapping must be in range(0x%x)",
Victor Stinnerfb161b12013-04-18 01:44:27 +02008339 (unsigned long)MAX_UNICODE + 1);
8340 goto onError;
8341 }
8342
8343 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8344 goto onError;
8345 }
8346 else if (PyUnicode_Check(item)) {
8347 if (PyUnicode_READY(item) == -1)
8348 goto onError;
8349 if (PyUnicode_GET_LENGTH(item) == 1) {
8350 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8351 if (value == 0xFFFE)
8352 goto Undefined;
8353 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8354 goto onError;
8355 }
8356 else {
8357 writer->overallocate = 1;
8358 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8359 goto onError;
8360 }
8361 }
8362 else {
8363 /* wrong return value */
8364 PyErr_SetString(PyExc_TypeError,
8365 "character mapping must return integer, None or str");
8366 goto onError;
8367 }
8368 Py_CLEAR(item);
8369 ++s;
8370 continue;
8371
8372Undefined:
8373 /* undefined mapping */
8374 Py_CLEAR(item);
8375 startinpos = s-starts;
8376 endinpos = startinpos+1;
8377 if (unicode_decode_call_errorhandler_writer(
8378 errors, &errorHandler,
8379 "charmap", "character maps to <undefined>",
8380 &starts, &e, &startinpos, &endinpos, &exc, &s,
8381 writer)) {
8382 goto onError;
8383 }
8384 }
8385 Py_XDECREF(errorHandler);
8386 Py_XDECREF(exc);
8387 return 0;
8388
8389onError:
8390 Py_XDECREF(item);
8391 Py_XDECREF(errorHandler);
8392 Py_XDECREF(exc);
8393 return -1;
8394}
8395
Alexander Belopolsky40018472011-02-26 01:02:56 +00008396PyObject *
8397PyUnicode_DecodeCharmap(const char *s,
8398 Py_ssize_t size,
8399 PyObject *mapping,
8400 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008401{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008402 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008403
Guido van Rossumd57fd912000-03-10 22:53:23 +00008404 /* Default to Latin-1 */
8405 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008406 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008407
Guido van Rossumd57fd912000-03-10 22:53:23 +00008408 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008409 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008410 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008411 writer.min_length = size;
8412 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008414
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008415 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008416 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8417 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008418 }
8419 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008420 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8421 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008422 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008423 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008424
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008426 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008427 return NULL;
8428}
8429
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008430/* Charmap encoding: the lookup table */
8431
Alexander Belopolsky40018472011-02-26 01:02:56 +00008432struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 PyObject_HEAD
8434 unsigned char level1[32];
8435 int count2, count3;
8436 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008437};
8438
8439static PyObject*
8440encoding_map_size(PyObject *obj, PyObject* args)
8441{
8442 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008443 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008445}
8446
8447static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008448 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008449 PyDoc_STR("Return the size (in bytes) of this object") },
8450 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008451};
8452
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008453static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008454 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 "EncodingMap", /*tp_name*/
8456 sizeof(struct encoding_map), /*tp_basicsize*/
8457 0, /*tp_itemsize*/
8458 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008459 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008460 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 0, /*tp_getattr*/
8462 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008463 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 0, /*tp_repr*/
8465 0, /*tp_as_number*/
8466 0, /*tp_as_sequence*/
8467 0, /*tp_as_mapping*/
8468 0, /*tp_hash*/
8469 0, /*tp_call*/
8470 0, /*tp_str*/
8471 0, /*tp_getattro*/
8472 0, /*tp_setattro*/
8473 0, /*tp_as_buffer*/
8474 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8475 0, /*tp_doc*/
8476 0, /*tp_traverse*/
8477 0, /*tp_clear*/
8478 0, /*tp_richcompare*/
8479 0, /*tp_weaklistoffset*/
8480 0, /*tp_iter*/
8481 0, /*tp_iternext*/
8482 encoding_map_methods, /*tp_methods*/
8483 0, /*tp_members*/
8484 0, /*tp_getset*/
8485 0, /*tp_base*/
8486 0, /*tp_dict*/
8487 0, /*tp_descr_get*/
8488 0, /*tp_descr_set*/
8489 0, /*tp_dictoffset*/
8490 0, /*tp_init*/
8491 0, /*tp_alloc*/
8492 0, /*tp_new*/
8493 0, /*tp_free*/
8494 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008495};
8496
8497PyObject*
8498PyUnicode_BuildEncodingMap(PyObject* string)
8499{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008500 PyObject *result;
8501 struct encoding_map *mresult;
8502 int i;
8503 int need_dict = 0;
8504 unsigned char level1[32];
8505 unsigned char level2[512];
8506 unsigned char *mlevel1, *mlevel2, *mlevel3;
8507 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008508 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008509 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008510 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008511 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008512
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008513 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008514 PyErr_BadArgument();
8515 return NULL;
8516 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008517 kind = PyUnicode_KIND(string);
8518 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008519 length = PyUnicode_GET_LENGTH(string);
8520 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008521 memset(level1, 0xFF, sizeof level1);
8522 memset(level2, 0xFF, sizeof level2);
8523
8524 /* If there isn't a one-to-one mapping of NULL to \0,
8525 or if there are non-BMP characters, we need to use
8526 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008527 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008528 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008529 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008530 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008531 ch = PyUnicode_READ(kind, data, i);
8532 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008533 need_dict = 1;
8534 break;
8535 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008536 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008537 /* unmapped character */
8538 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008539 l1 = ch >> 11;
8540 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008541 if (level1[l1] == 0xFF)
8542 level1[l1] = count2++;
8543 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008544 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008545 }
8546
8547 if (count2 >= 0xFF || count3 >= 0xFF)
8548 need_dict = 1;
8549
8550 if (need_dict) {
8551 PyObject *result = PyDict_New();
8552 PyObject *key, *value;
8553 if (!result)
8554 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008555 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008556 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008557 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008558 if (!key || !value)
8559 goto failed1;
8560 if (PyDict_SetItem(result, key, value) == -1)
8561 goto failed1;
8562 Py_DECREF(key);
8563 Py_DECREF(value);
8564 }
8565 return result;
8566 failed1:
8567 Py_XDECREF(key);
8568 Py_XDECREF(value);
8569 Py_DECREF(result);
8570 return NULL;
8571 }
8572
8573 /* Create a three-level trie */
Victor Stinner32bd68c2020-12-01 10:37:39 +01008574 result = PyObject_Malloc(sizeof(struct encoding_map) +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008575 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008576 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008577 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008578 }
8579
8580 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008581 mresult = (struct encoding_map*)result;
8582 mresult->count2 = count2;
8583 mresult->count3 = count3;
8584 mlevel1 = mresult->level1;
8585 mlevel2 = mresult->level23;
8586 mlevel3 = mresult->level23 + 16*count2;
8587 memcpy(mlevel1, level1, 32);
8588 memset(mlevel2, 0xFF, 16*count2);
8589 memset(mlevel3, 0, 128*count3);
8590 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008591 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008592 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008593 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8594 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008595 /* unmapped character */
8596 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008597 o1 = ch>>11;
8598 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008599 i2 = 16*mlevel1[o1] + o2;
8600 if (mlevel2[i2] == 0xFF)
8601 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008602 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008603 i3 = 128*mlevel2[i2] + o3;
8604 mlevel3[i3] = i;
8605 }
8606 return result;
8607}
8608
8609static int
Victor Stinner22168992011-11-20 17:09:18 +01008610encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008611{
8612 struct encoding_map *map = (struct encoding_map*)mapping;
8613 int l1 = c>>11;
8614 int l2 = (c>>7) & 0xF;
8615 int l3 = c & 0x7F;
8616 int i;
8617
Victor Stinner22168992011-11-20 17:09:18 +01008618 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008619 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008620 if (c == 0)
8621 return 0;
8622 /* level 1*/
8623 i = map->level1[l1];
8624 if (i == 0xFF) {
8625 return -1;
8626 }
8627 /* level 2*/
8628 i = map->level23[16*i+l2];
8629 if (i == 0xFF) {
8630 return -1;
8631 }
8632 /* level 3 */
8633 i = map->level23[16*map->count2 + 128*i + l3];
8634 if (i == 0) {
8635 return -1;
8636 }
8637 return i;
8638}
8639
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008640/* Lookup the character ch in the mapping. If the character
8641 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008642 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008643static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008644charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008645{
Christian Heimes217cfd12007-12-02 14:31:20 +00008646 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008647 PyObject *x;
8648
8649 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008650 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008651 x = PyObject_GetItem(mapping, w);
8652 Py_DECREF(w);
8653 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008654 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8655 /* No mapping found means: mapping is undefined. */
8656 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008657 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008658 } else
8659 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008660 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008661 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008662 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008663 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008664 long value = PyLong_AS_LONG(x);
8665 if (value < 0 || value > 255) {
8666 PyErr_SetString(PyExc_TypeError,
8667 "character mapping must be in range(256)");
8668 Py_DECREF(x);
8669 return NULL;
8670 }
8671 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008673 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008674 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008676 /* wrong return value */
8677 PyErr_Format(PyExc_TypeError,
8678 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008679 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008680 Py_DECREF(x);
8681 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 }
8683}
8684
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008685static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008686charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008687{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008688 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8689 /* exponentially overallocate to minimize reallocations */
8690 if (requiredsize < 2*outsize)
8691 requiredsize = 2*outsize;
8692 if (_PyBytes_Resize(outobj, requiredsize))
8693 return -1;
8694 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008695}
8696
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008700/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008701 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702 space is available. Return a new reference to the object that
8703 was put in the output buffer, or Py_None, if the mapping was undefined
8704 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008705 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008706static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008707charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008708 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008709{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008710 PyObject *rep;
8711 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008712 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008713
Andy Lesterdffe4c02020-03-04 07:15:20 -06008714 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008715 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008716 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008717 if (res == -1)
8718 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 if (outsize<requiredsize)
8720 if (charmapencode_resize(outobj, outpos, requiredsize))
8721 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008722 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008723 outstart[(*outpos)++] = (char)res;
8724 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008725 }
8726
8727 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008728 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008730 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 Py_DECREF(rep);
8732 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008733 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 if (PyLong_Check(rep)) {
8735 Py_ssize_t requiredsize = *outpos+1;
8736 if (outsize<requiredsize)
8737 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8738 Py_DECREF(rep);
8739 return enc_EXCEPTION;
8740 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008741 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008742 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008743 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008744 else {
8745 const char *repchars = PyBytes_AS_STRING(rep);
8746 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8747 Py_ssize_t requiredsize = *outpos+repsize;
8748 if (outsize<requiredsize)
8749 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8750 Py_DECREF(rep);
8751 return enc_EXCEPTION;
8752 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008753 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 memcpy(outstart + *outpos, repchars, repsize);
8755 *outpos += repsize;
8756 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008757 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008758 Py_DECREF(rep);
8759 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008760}
8761
8762/* handle an error in PyUnicode_EncodeCharmap
8763 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008764static int
8765charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008766 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008767 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008768 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008769 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008770{
8771 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008772 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008773 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008774 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008775 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008776 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008777 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008778 Py_ssize_t collstartpos = *inpos;
8779 Py_ssize_t collendpos = *inpos+1;
8780 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008781 const char *encoding = "charmap";
8782 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008783 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008784 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008785 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008786
Benjamin Petersonbac79492012-01-14 13:34:47 -05008787 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008788 return -1;
8789 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008790 /* find all unencodable characters */
8791 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008792 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008793 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008794 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008795 val = encoding_map_lookup(ch, mapping);
8796 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 break;
8798 ++collendpos;
8799 continue;
8800 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008801
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008802 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8803 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008804 if (rep==NULL)
8805 return -1;
8806 else if (rep!=Py_None) {
8807 Py_DECREF(rep);
8808 break;
8809 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008810 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008812 }
8813 /* cache callback name lookup
8814 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008815 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008816 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008817
8818 switch (*error_handler) {
8819 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008820 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008821 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008822
8823 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008824 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008825 x = charmapencode_output('?', mapping, res, respos);
8826 if (x==enc_EXCEPTION) {
8827 return -1;
8828 }
8829 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008830 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008831 return -1;
8832 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008833 }
8834 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008835 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008836 *inpos = collendpos;
8837 break;
Victor Stinner50149202015-09-22 00:26:54 +02008838
8839 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008840 /* generate replacement (temporarily (mis)uses p) */
8841 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008842 char buffer[2+29+1+1];
8843 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008844 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 for (cp = buffer; *cp; ++cp) {
8846 x = charmapencode_output(*cp, mapping, res, respos);
8847 if (x==enc_EXCEPTION)
8848 return -1;
8849 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008850 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 return -1;
8852 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008853 }
8854 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008855 *inpos = collendpos;
8856 break;
Victor Stinner50149202015-09-22 00:26:54 +02008857
Benjamin Peterson14339b62009-01-31 16:36:08 +00008858 default:
Victor Stinner50149202015-09-22 00:26:54 +02008859 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008860 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008861 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008862 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008863 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008864 if (PyBytes_Check(repunicode)) {
8865 /* Directly copy bytes result to output. */
8866 Py_ssize_t outsize = PyBytes_Size(*res);
8867 Py_ssize_t requiredsize;
8868 repsize = PyBytes_Size(repunicode);
8869 requiredsize = *respos + repsize;
8870 if (requiredsize > outsize)
8871 /* Make room for all additional bytes. */
8872 if (charmapencode_resize(res, respos, requiredsize)) {
8873 Py_DECREF(repunicode);
8874 return -1;
8875 }
8876 memcpy(PyBytes_AsString(*res) + *respos,
8877 PyBytes_AsString(repunicode), repsize);
8878 *respos += repsize;
8879 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008880 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008881 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008882 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008883 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008884 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008885 Py_DECREF(repunicode);
8886 return -1;
8887 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008888 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008889 data = PyUnicode_DATA(repunicode);
8890 kind = PyUnicode_KIND(repunicode);
8891 for (index = 0; index < repsize; index++) {
8892 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8893 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008894 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008895 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008896 return -1;
8897 }
8898 else if (x==enc_FAILED) {
8899 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008900 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008901 return -1;
8902 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008903 }
8904 *inpos = newpos;
8905 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008906 }
8907 return 0;
8908}
8909
Alexander Belopolsky40018472011-02-26 01:02:56 +00008910PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008911_PyUnicode_EncodeCharmap(PyObject *unicode,
8912 PyObject *mapping,
8913 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008914{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008915 /* output object */
8916 PyObject *res = NULL;
8917 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008918 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008919 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008920 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008921 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008922 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008923 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008924 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008925 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008926 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008927
Benjamin Petersonbac79492012-01-14 13:34:47 -05008928 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008929 return NULL;
8930 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008931 data = PyUnicode_DATA(unicode);
8932 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008933
Guido van Rossumd57fd912000-03-10 22:53:23 +00008934 /* Default to Latin-1 */
8935 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008936 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008937
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008938 /* allocate enough for a simple encoding without
8939 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008940 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008941 if (res == NULL)
8942 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008943 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008944 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008945
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008946 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008947 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008948 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008949 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008950 if (x==enc_EXCEPTION) /* error */
8951 goto onError;
8952 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008953 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008954 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008955 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008956 &res, &respos)) {
8957 goto onError;
8958 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008959 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008960 else
8961 /* done with this character => adjust input position */
8962 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008963 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008964
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008965 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008966 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008967 if (_PyBytes_Resize(&res, respos) < 0)
8968 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008969
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008970 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008971 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008972 return res;
8973
Benjamin Peterson29060642009-01-31 22:14:21 +00008974 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008975 Py_XDECREF(res);
8976 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008977 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008978 return NULL;
8979}
8980
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008981/* Deprecated */
8982PyObject *
8983PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8984 Py_ssize_t size,
8985 PyObject *mapping,
8986 const char *errors)
8987{
8988 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008989 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008990 if (unicode == NULL)
8991 return NULL;
8992 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8993 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008994 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008995}
8996
Alexander Belopolsky40018472011-02-26 01:02:56 +00008997PyObject *
8998PyUnicode_AsCharmapString(PyObject *unicode,
8999 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009000{
9001 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009002 PyErr_BadArgument();
9003 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009004 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009005 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006}
9007
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009008/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009009static void
9010make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009012 Py_ssize_t startpos, Py_ssize_t endpos,
9013 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009014{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009015 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009016 *exceptionObject = _PyUnicodeTranslateError_Create(
9017 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018 }
9019 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009020 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9021 goto onError;
9022 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9023 goto onError;
9024 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9025 goto onError;
9026 return;
9027 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02009028 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009029 }
9030}
9031
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009032/* error handling callback helper:
9033 build arguments, call the callback and check the arguments,
9034 put the result into newpos and return the replacement string, which
9035 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009036static PyObject *
9037unicode_translate_call_errorhandler(const char *errors,
9038 PyObject **errorHandler,
9039 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009040 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009041 Py_ssize_t startpos, Py_ssize_t endpos,
9042 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009043{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009044 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009045
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009046 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009047 PyObject *restuple;
9048 PyObject *resunicode;
9049
9050 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009051 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009052 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009053 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009054 }
9055
9056 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009058 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009059 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009060
Petr Viktorinffd97532020-02-11 17:46:57 +01009061 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009062 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009063 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009064 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009065 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00009066 Py_DECREF(restuple);
9067 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009068 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009069 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00009070 &resunicode, &i_newpos)) {
9071 Py_DECREF(restuple);
9072 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009073 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00009074 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009075 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009076 else
9077 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02009079 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009080 Py_DECREF(restuple);
9081 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00009082 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009083 Py_INCREF(resunicode);
9084 Py_DECREF(restuple);
9085 return resunicode;
9086}
9087
9088/* Lookup the character ch in the mapping and put the result in result,
9089 which must be decrefed by the caller.
9090 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009091static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009092charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009093{
Christian Heimes217cfd12007-12-02 14:31:20 +00009094 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009095 PyObject *x;
9096
9097 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009098 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009099 x = PyObject_GetItem(mapping, w);
9100 Py_DECREF(w);
9101 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009102 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9103 /* No mapping found means: use 1:1 mapping. */
9104 PyErr_Clear();
9105 *result = NULL;
9106 return 0;
9107 } else
9108 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009109 }
9110 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009111 *result = x;
9112 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009113 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009114 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009115 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009116 if (value < 0 || value > MAX_UNICODE) {
9117 PyErr_Format(PyExc_ValueError,
9118 "character mapping must be in range(0x%x)",
9119 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00009120 Py_DECREF(x);
9121 return -1;
9122 }
9123 *result = x;
9124 return 0;
9125 }
9126 else if (PyUnicode_Check(x)) {
9127 *result = x;
9128 return 0;
9129 }
9130 else {
9131 /* wrong return value */
9132 PyErr_SetString(PyExc_TypeError,
9133 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009134 Py_DECREF(x);
9135 return -1;
9136 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009137}
Victor Stinner1194ea02014-04-04 19:37:40 +02009138
9139/* lookup the character, write the result into the writer.
9140 Return 1 if the result was written into the writer, return 0 if the mapping
9141 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009142static int
Victor Stinner1194ea02014-04-04 19:37:40 +02009143charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9144 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009145{
Victor Stinner1194ea02014-04-04 19:37:40 +02009146 PyObject *item;
9147
9148 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009149 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009150
9151 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009152 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009153 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009155 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009156 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009157 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009158
9159 if (item == Py_None) {
9160 Py_DECREF(item);
9161 return 0;
9162 }
9163
9164 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009165 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9166 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9167 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009168 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9169 Py_DECREF(item);
9170 return -1;
9171 }
9172 Py_DECREF(item);
9173 return 1;
9174 }
9175
9176 if (!PyUnicode_Check(item)) {
9177 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009178 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009179 }
9180
9181 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9182 Py_DECREF(item);
9183 return -1;
9184 }
9185
9186 Py_DECREF(item);
9187 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009188}
9189
Victor Stinner89a76ab2014-04-05 11:44:04 +02009190static int
9191unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9192 Py_UCS1 *translate)
9193{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009194 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009195 int ret = 0;
9196
Victor Stinner89a76ab2014-04-05 11:44:04 +02009197 if (charmaptranslate_lookup(ch, mapping, &item)) {
9198 return -1;
9199 }
9200
9201 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009202 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009203 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009204 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009205 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009206 /* not found => default to 1:1 mapping */
9207 translate[ch] = ch;
9208 return 1;
9209 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009210 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009211 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009212 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9213 used it */
9214 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009215 /* invalid character or character outside ASCII:
9216 skip the fast translate */
9217 goto exit;
9218 }
9219 translate[ch] = (Py_UCS1)replace;
9220 }
9221 else if (PyUnicode_Check(item)) {
9222 Py_UCS4 replace;
9223
9224 if (PyUnicode_READY(item) == -1) {
9225 Py_DECREF(item);
9226 return -1;
9227 }
9228 if (PyUnicode_GET_LENGTH(item) != 1)
9229 goto exit;
9230
9231 replace = PyUnicode_READ_CHAR(item, 0);
9232 if (replace > 127)
9233 goto exit;
9234 translate[ch] = (Py_UCS1)replace;
9235 }
9236 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009237 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009238 goto exit;
9239 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009240 ret = 1;
9241
Benjamin Peterson1365de72014-04-07 20:15:41 -04009242 exit:
9243 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009244 return ret;
9245}
9246
9247/* Fast path for ascii => ascii translation. Return 1 if the whole string
9248 was translated into writer, return 0 if the input string was partially
9249 translated into writer, raise an exception and return -1 on error. */
9250static int
9251unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009252 _PyUnicodeWriter *writer, int ignore,
9253 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009254{
Victor Stinner872b2912014-04-05 14:27:07 +02009255 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009256 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009257 const Py_UCS1 *in, *end;
9258 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009259 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009260
Victor Stinner89a76ab2014-04-05 11:44:04 +02009261 len = PyUnicode_GET_LENGTH(input);
9262
Victor Stinner872b2912014-04-05 14:27:07 +02009263 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009264
9265 in = PyUnicode_1BYTE_DATA(input);
9266 end = in + len;
9267
9268 assert(PyUnicode_IS_ASCII(writer->buffer));
9269 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9270 out = PyUnicode_1BYTE_DATA(writer->buffer);
9271
Victor Stinner872b2912014-04-05 14:27:07 +02009272 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009273 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009274 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009275 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009276 int translate = unicode_fast_translate_lookup(mapping, ch,
9277 ascii_table);
9278 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009279 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009280 if (translate == 0)
9281 goto exit;
9282 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009283 }
Victor Stinner872b2912014-04-05 14:27:07 +02009284 if (ch2 == 0xfe) {
9285 if (ignore)
9286 continue;
9287 goto exit;
9288 }
9289 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009290 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009291 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009292 }
Victor Stinner872b2912014-04-05 14:27:07 +02009293 res = 1;
9294
9295exit:
9296 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009297 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009298 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009299}
9300
Victor Stinner3222da22015-10-01 22:07:32 +02009301static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302_PyUnicode_TranslateCharmap(PyObject *input,
9303 PyObject *mapping,
9304 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009305{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009307 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308 Py_ssize_t size, i;
9309 int kind;
9310 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009311 _PyUnicodeWriter writer;
9312 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009313 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009314 PyObject *errorHandler = NULL;
9315 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009316 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009317 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009318
Guido van Rossumd57fd912000-03-10 22:53:23 +00009319 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009320 PyErr_BadArgument();
9321 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009322 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009323
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009324 if (PyUnicode_READY(input) == -1)
9325 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009326 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009327 kind = PyUnicode_KIND(input);
9328 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009329
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009330 if (size == 0)
9331 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009332
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009333 /* allocate enough for a simple 1:1 translation without
9334 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009335 _PyUnicodeWriter_Init(&writer);
9336 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009337 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009338
Victor Stinner872b2912014-04-05 14:27:07 +02009339 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9340
Victor Stinner33798672016-03-01 21:59:58 +01009341 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009342 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009343 if (PyUnicode_IS_ASCII(input)) {
9344 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9345 if (res < 0) {
9346 _PyUnicodeWriter_Dealloc(&writer);
9347 return NULL;
9348 }
9349 if (res == 1)
9350 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009351 }
Victor Stinner33798672016-03-01 21:59:58 +01009352 else {
9353 i = 0;
9354 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009355
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009357 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009358 int translate;
9359 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9360 Py_ssize_t newpos;
9361 /* startpos for collecting untranslatable chars */
9362 Py_ssize_t collstart;
9363 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009364 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009365
Victor Stinner1194ea02014-04-04 19:37:40 +02009366 ch = PyUnicode_READ(kind, data, i);
9367 translate = charmaptranslate_output(ch, mapping, &writer);
9368 if (translate < 0)
9369 goto onError;
9370
9371 if (translate != 0) {
9372 /* it worked => adjust input pointer */
9373 ++i;
9374 continue;
9375 }
9376
9377 /* untranslatable character */
9378 collstart = i;
9379 collend = i+1;
9380
9381 /* find all untranslatable characters */
9382 while (collend < size) {
9383 PyObject *x;
9384 ch = PyUnicode_READ(kind, data, collend);
9385 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009386 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009387 Py_XDECREF(x);
9388 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009389 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009390 ++collend;
9391 }
9392
9393 if (ignore) {
9394 i = collend;
9395 }
9396 else {
9397 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9398 reason, input, &exc,
9399 collstart, collend, &newpos);
9400 if (repunicode == NULL)
9401 goto onError;
9402 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009403 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009404 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009405 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009406 Py_DECREF(repunicode);
9407 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009408 }
9409 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009410 Py_XDECREF(exc);
9411 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009412 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009413
Benjamin Peterson29060642009-01-31 22:14:21 +00009414 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009415 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009416 Py_XDECREF(exc);
9417 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009418 return NULL;
9419}
9420
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009421/* Deprecated. Use PyUnicode_Translate instead. */
9422PyObject *
9423PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9424 Py_ssize_t size,
9425 PyObject *mapping,
9426 const char *errors)
9427{
Christian Heimes5f520f42012-09-11 14:03:25 +02009428 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009429 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009430 if (!unicode)
9431 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009432 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9433 Py_DECREF(unicode);
9434 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435}
9436
Alexander Belopolsky40018472011-02-26 01:02:56 +00009437PyObject *
9438PyUnicode_Translate(PyObject *str,
9439 PyObject *mapping,
9440 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009441{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009442 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009443 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009444 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009445}
Tim Petersced69f82003-09-16 20:30:58 +00009446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447PyObject *
9448_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9449{
9450 if (!PyUnicode_Check(unicode)) {
9451 PyErr_BadInternalCall();
9452 return NULL;
9453 }
9454 if (PyUnicode_READY(unicode) == -1)
9455 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009456 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009457 /* If the string is already ASCII, just return the same string */
9458 Py_INCREF(unicode);
9459 return unicode;
9460 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009461
9462 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9463 PyObject *result = PyUnicode_New(len, 127);
9464 if (result == NULL) {
9465 return NULL;
9466 }
9467
9468 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9469 int kind = PyUnicode_KIND(unicode);
9470 const void *data = PyUnicode_DATA(unicode);
9471 Py_ssize_t i;
9472 for (i = 0; i < len; ++i) {
9473 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9474 if (ch < 127) {
9475 out[i] = ch;
9476 }
9477 else if (Py_UNICODE_ISSPACE(ch)) {
9478 out[i] = ' ';
9479 }
9480 else {
9481 int decimal = Py_UNICODE_TODECIMAL(ch);
9482 if (decimal < 0) {
9483 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009484 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009485 _PyUnicode_LENGTH(result) = i + 1;
9486 break;
9487 }
9488 out[i] = '0' + decimal;
9489 }
9490 }
9491
INADA Naoki16dfca42018-07-14 12:06:43 +09009492 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009493 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009494}
9495
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009496PyObject *
9497PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9498 Py_ssize_t length)
9499{
Victor Stinnerf0124502011-11-21 23:12:56 +01009500 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009501 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009502 Py_UCS4 maxchar;
9503 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009504 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009505
Victor Stinner99d7ad02012-02-22 13:37:39 +01009506 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009507 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009508 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009509 if (ch > 127) {
9510 int decimal = Py_UNICODE_TODECIMAL(ch);
9511 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009512 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009513 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009514 }
9515 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009516
9517 /* Copy to a new string */
9518 decimal = PyUnicode_New(length, maxchar);
9519 if (decimal == NULL)
9520 return decimal;
9521 kind = PyUnicode_KIND(decimal);
9522 data = PyUnicode_DATA(decimal);
9523 /* Iterate over code points */
9524 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009525 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009526 if (ch > 127) {
9527 int decimal = Py_UNICODE_TODECIMAL(ch);
9528 if (decimal >= 0)
9529 ch = '0' + decimal;
9530 }
9531 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009533 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009534}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009535/* --- Decimal Encoder ---------------------------------------------------- */
9536
Alexander Belopolsky40018472011-02-26 01:02:56 +00009537int
9538PyUnicode_EncodeDecimal(Py_UNICODE *s,
9539 Py_ssize_t length,
9540 char *output,
9541 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009542{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009543 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009544 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009545 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009546 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009547
9548 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009549 PyErr_BadArgument();
9550 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009551 }
9552
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009553 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009554 if (unicode == NULL)
9555 return -1;
9556
Victor Stinner42bf7752011-11-21 22:52:58 +01009557 kind = PyUnicode_KIND(unicode);
9558 data = PyUnicode_DATA(unicode);
9559
Victor Stinnerb84d7232011-11-22 01:50:07 +01009560 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009561 PyObject *exc;
9562 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009563 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009564 Py_ssize_t startpos;
9565
9566 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009567
Benjamin Peterson29060642009-01-31 22:14:21 +00009568 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009569 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009570 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009571 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009572 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009573 decimal = Py_UNICODE_TODECIMAL(ch);
9574 if (decimal >= 0) {
9575 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009576 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009577 continue;
9578 }
9579 if (0 < ch && ch < 256) {
9580 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009581 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009582 continue;
9583 }
Victor Stinner6345be92011-11-25 20:09:01 +01009584
Victor Stinner42bf7752011-11-21 22:52:58 +01009585 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009586 exc = NULL;
9587 raise_encode_exception(&exc, "decimal", unicode,
9588 startpos, startpos+1,
9589 "invalid decimal Unicode string");
9590 Py_XDECREF(exc);
9591 Py_DECREF(unicode);
9592 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009593 }
9594 /* 0-terminate the output string */
9595 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009596 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009597 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009598}
9599
Guido van Rossumd57fd912000-03-10 22:53:23 +00009600/* --- Helpers ------------------------------------------------------------ */
9601
/* Helper macro to fix up start/end slice values in place.
 *
 * `start` and `end` must be modifiable lvalues; `len` is the length of the
 * sequence being sliced.  `end` is clamped to `len`; a negative `start` or
 * `end` is first taken relative to `len` and then clamped to 0.  `start` is
 * not clamped against `len` or `end`, so callers still have to cope with
 * start > end afterwards.
 *
 * The body is wrapped in do { ... } while (0) so that the expansion is a
 * single statement and stays correct inside an unbraced if/else.  Note that
 * the arguments are evaluated more than once.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9616
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009618any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009619 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009620 Py_ssize_t end,
9621 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009623 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009624 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625 Py_ssize_t len1, len2, result;
9626
9627 kind1 = PyUnicode_KIND(s1);
9628 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009629 if (kind1 < kind2)
9630 return -1;
9631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 len1 = PyUnicode_GET_LENGTH(s1);
9633 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009634 ADJUST_INDICES(start, end, len1);
9635 if (end - start < len2)
9636 return -1;
9637
9638 buf1 = PyUnicode_DATA(s1);
9639 buf2 = PyUnicode_DATA(s2);
9640 if (len2 == 1) {
9641 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9642 result = findchar((const char *)buf1 + kind1*start,
9643 kind1, end - start, ch, direction);
9644 if (result == -1)
9645 return -1;
9646 else
9647 return start + result;
9648 }
9649
9650 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009651 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009652 if (!buf2)
9653 return -2;
9654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009655
Victor Stinner794d5672011-10-10 03:21:36 +02009656 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009657 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009658 case PyUnicode_1BYTE_KIND:
9659 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9660 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9661 else
9662 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9663 break;
9664 case PyUnicode_2BYTE_KIND:
9665 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9666 break;
9667 case PyUnicode_4BYTE_KIND:
9668 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9669 break;
9670 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009671 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009672 }
9673 }
9674 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009675 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009676 case PyUnicode_1BYTE_KIND:
9677 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9678 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9679 else
9680 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9681 break;
9682 case PyUnicode_2BYTE_KIND:
9683 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9684 break;
9685 case PyUnicode_4BYTE_KIND:
9686 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9687 break;
9688 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009689 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009690 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009691 }
9692
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009693 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009694 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009695 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009696
9697 return result;
9698}
9699
Victor Stinner59423e32018-11-26 13:40:01 +01009700/* _PyUnicode_InsertThousandsGrouping() helper functions */
9701#include "stringlib/localeutil.h"
9702
9703/**
9704 * InsertThousandsGrouping:
9705 * @writer: Unicode writer.
9706 * @n_buffer: Number of characters in @buffer.
9707 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9708 * @d_pos: Start of digits string.
9709 * @n_digits: The number of digits in the string, in which we want
9710 * to put the grouping chars.
9711 * @min_width: The minimum width of the digits in the output string.
9712 * Output will be zero-padded on the left to fill.
9713 * @grouping: see definition in localeconv().
9714 * @thousands_sep: see definition in localeconv().
9715 *
9716 * There are 2 modes: counting and filling. If @writer is NULL,
9717 * we are in counting mode, else filling mode.
9718 * If counting, the required buffer size is returned.
9719 * If filling, we know the buffer will be large enough, so we don't
9720 * need to pass in the buffer size.
9721 * Inserts thousand grouping characters (as defined by grouping and
9722 * thousands_sep) into @writer.
9723 *
9724 * Return value: -1 on error, number of characters otherwise.
9725 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009726Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009727_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009728 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009729 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009730 PyObject *digits,
9731 Py_ssize_t d_pos,
9732 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009733 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009734 const char *grouping,
9735 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009736 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009737{
Xtreak3f7983a2019-01-07 20:39:14 +05309738 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009739 if (writer) {
9740 assert(digits != NULL);
9741 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009742 }
9743 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009744 assert(digits == NULL);
9745 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009746 }
Victor Stinner59423e32018-11-26 13:40:01 +01009747 assert(0 <= d_pos);
9748 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009749 assert(grouping != NULL);
9750
9751 if (digits != NULL) {
9752 if (PyUnicode_READY(digits) == -1) {
9753 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009754 }
Victor Stinner59423e32018-11-26 13:40:01 +01009755 }
9756 if (PyUnicode_READY(thousands_sep) == -1) {
9757 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009758 }
9759
Victor Stinner59423e32018-11-26 13:40:01 +01009760 Py_ssize_t count = 0;
9761 Py_ssize_t n_zeros;
9762 int loop_broken = 0;
9763 int use_separator = 0; /* First time through, don't append the
9764 separator. They only go between
9765 groups. */
9766 Py_ssize_t buffer_pos;
9767 Py_ssize_t digits_pos;
9768 Py_ssize_t len;
9769 Py_ssize_t n_chars;
9770 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9771 be looked at */
9772 /* A generator that returns all of the grouping widths, until it
9773 returns 0. */
9774 GroupGenerator groupgen;
9775 GroupGenerator_init(&groupgen, grouping);
9776 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9777
9778 /* if digits are not grouped, thousands separator
9779 should be an empty string */
9780 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9781
9782 digits_pos = d_pos + n_digits;
9783 if (writer) {
9784 buffer_pos = writer->pos + n_buffer;
9785 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9786 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787 }
Victor Stinner59423e32018-11-26 13:40:01 +01009788 else {
9789 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009790 }
Victor Stinner59423e32018-11-26 13:40:01 +01009791
9792 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009793 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009794 }
Victor Stinner59423e32018-11-26 13:40:01 +01009795
9796 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9797 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9798 n_zeros = Py_MAX(0, len - remaining);
9799 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9800
9801 /* Use n_zero zero's and n_chars chars */
9802
9803 /* Count only, don't do anything. */
9804 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9805
9806 /* Copy into the writer. */
9807 InsertThousandsGrouping_fill(writer, &buffer_pos,
9808 digits, &digits_pos,
9809 n_chars, n_zeros,
9810 use_separator ? thousands_sep : NULL,
9811 thousands_sep_len, maxchar);
9812
9813 /* Use a separator next time. */
9814 use_separator = 1;
9815
9816 remaining -= n_chars;
9817 min_width -= len;
9818
9819 if (remaining <= 0 && min_width <= 0) {
9820 loop_broken = 1;
9821 break;
9822 }
9823 min_width -= thousands_sep_len;
9824 }
9825 if (!loop_broken) {
9826 /* We left the loop without using a break statement. */
9827
9828 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9829 n_zeros = Py_MAX(0, len - remaining);
9830 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9831
9832 /* Use n_zero zero's and n_chars chars */
9833 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9834
9835 /* Copy into the writer. */
9836 InsertThousandsGrouping_fill(writer, &buffer_pos,
9837 digits, &digits_pos,
9838 n_chars, n_zeros,
9839 use_separator ? thousands_sep : NULL,
9840 thousands_sep_len, maxchar);
9841 }
9842 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009843}
9844
9845
Alexander Belopolsky40018472011-02-26 01:02:56 +00009846Py_ssize_t
9847PyUnicode_Count(PyObject *str,
9848 PyObject *substr,
9849 Py_ssize_t start,
9850 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009851{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009852 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009853 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009854 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009855 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009856
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009857 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009858 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009859
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009860 kind1 = PyUnicode_KIND(str);
9861 kind2 = PyUnicode_KIND(substr);
9862 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009863 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009864
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009865 len1 = PyUnicode_GET_LENGTH(str);
9866 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009867 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009868 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009869 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009870
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009871 buf1 = PyUnicode_DATA(str);
9872 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009873 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009874 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009875 if (!buf2)
9876 goto onError;
9877 }
9878
9879 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009880 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009881 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009882 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009883 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009884 buf2, len2, PY_SSIZE_T_MAX
9885 );
9886 else
9887 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009888 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009889 buf2, len2, PY_SSIZE_T_MAX
9890 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009891 break;
9892 case PyUnicode_2BYTE_KIND:
9893 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009894 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009895 buf2, len2, PY_SSIZE_T_MAX
9896 );
9897 break;
9898 case PyUnicode_4BYTE_KIND:
9899 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009900 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009901 buf2, len2, PY_SSIZE_T_MAX
9902 );
9903 break;
9904 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009905 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009906 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009907
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009908 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009909 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009910 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009911
Guido van Rossumd57fd912000-03-10 22:53:23 +00009912 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009913 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009914 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9915 if (kind2 != kind1)
9916 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009917 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918}
9919
Alexander Belopolsky40018472011-02-26 01:02:56 +00009920Py_ssize_t
9921PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009922 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009923 Py_ssize_t start,
9924 Py_ssize_t end,
9925 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009927 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009928 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009929
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009930 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009931}
9932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009933Py_ssize_t
9934PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9935 Py_ssize_t start, Py_ssize_t end,
9936 int direction)
9937{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009938 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009939 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009940 if (PyUnicode_READY(str) == -1)
9941 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009942 len = PyUnicode_GET_LENGTH(str);
9943 ADJUST_INDICES(start, end, len);
9944 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009945 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009947 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9948 kind, end-start, ch, direction);
9949 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009950 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009951 else
9952 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009953}
9954
Alexander Belopolsky40018472011-02-26 01:02:56 +00009955static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009956tailmatch(PyObject *self,
9957 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009958 Py_ssize_t start,
9959 Py_ssize_t end,
9960 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009961{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009962 int kind_self;
9963 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009964 const void *data_self;
9965 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009966 Py_ssize_t offset;
9967 Py_ssize_t i;
9968 Py_ssize_t end_sub;
9969
9970 if (PyUnicode_READY(self) == -1 ||
9971 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009972 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009973
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9975 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009976 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009977 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009978
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009979 if (PyUnicode_GET_LENGTH(substring) == 0)
9980 return 1;
9981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009982 kind_self = PyUnicode_KIND(self);
9983 data_self = PyUnicode_DATA(self);
9984 kind_sub = PyUnicode_KIND(substring);
9985 data_sub = PyUnicode_DATA(substring);
9986 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9987
9988 if (direction > 0)
9989 offset = end;
9990 else
9991 offset = start;
9992
9993 if (PyUnicode_READ(kind_self, data_self, offset) ==
9994 PyUnicode_READ(kind_sub, data_sub, 0) &&
9995 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9996 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9997 /* If both are of the same kind, memcmp is sufficient */
9998 if (kind_self == kind_sub) {
9999 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010000 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010001 data_sub,
10002 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010003 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010004 }
Martin Pantere26da7c2016-06-02 10:07:09 +000010005 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010006 else {
10007 /* We do not need to compare 0 and len(substring)-1 because
10008 the if statement above ensured already that they are equal
10009 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010010 for (i = 1; i < end_sub; ++i) {
10011 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
10012 PyUnicode_READ(kind_sub, data_sub, i))
10013 return 0;
10014 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010015 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010016 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010017 }
10018
10019 return 0;
10020}
10021
Alexander Belopolsky40018472011-02-26 01:02:56 +000010022Py_ssize_t
10023PyUnicode_Tailmatch(PyObject *str,
10024 PyObject *substr,
10025 Py_ssize_t start,
10026 Py_ssize_t end,
10027 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010028{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010029 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010030 return -1;
Tim Petersced69f82003-09-16 20:30:58 +000010031
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010032 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010033}
10034
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010035static PyObject *
10036ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010037{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010038 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010039 const char *data = PyUnicode_DATA(self);
10040 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010041 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +000010042
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010043 res = PyUnicode_New(len, 127);
10044 if (res == NULL)
10045 return NULL;
10046 resdata = PyUnicode_DATA(res);
10047 if (lower)
10048 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010049 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010050 _Py_bytes_upper(resdata, data, len);
10051 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010052}
10053
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010054static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010055handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010056{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010057 Py_ssize_t j;
10058 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010010059 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010060 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +000010061
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010062 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10063
10064 where ! is a negation and \p{xxx} is a character with property xxx.
10065 */
10066 for (j = i - 1; j >= 0; j--) {
10067 c = PyUnicode_READ(kind, data, j);
10068 if (!_PyUnicode_IsCaseIgnorable(c))
10069 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010070 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010071 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10072 if (final_sigma) {
10073 for (j = i + 1; j < length; j++) {
10074 c = PyUnicode_READ(kind, data, j);
10075 if (!_PyUnicode_IsCaseIgnorable(c))
10076 break;
10077 }
10078 final_sigma = j == length || !_PyUnicode_IsCased(c);
10079 }
10080 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010081}
10082
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010083static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010084lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010085 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010086{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010087 /* Obscure special case. */
10088 if (c == 0x3A3) {
10089 mapped[0] = handle_capital_sigma(kind, data, length, i);
10090 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010091 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010092 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010093}
10094
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010095static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010096do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010097{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010098 Py_ssize_t i, k = 0;
10099 int n_res, j;
10100 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +000010101
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010102 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +010010103 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010104 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010105 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010106 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +000010107 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010108 for (i = 1; i < length; i++) {
10109 c = PyUnicode_READ(kind, data, i);
10110 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10111 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010112 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010113 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010114 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010115 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010116 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010117}
10118
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010119static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010120do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010121 Py_ssize_t i, k = 0;
10122
10123 for (i = 0; i < length; i++) {
10124 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10125 int n_res, j;
10126 if (Py_UNICODE_ISUPPER(c)) {
10127 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10128 }
10129 else if (Py_UNICODE_ISLOWER(c)) {
10130 n_res = _PyUnicode_ToUpperFull(c, mapped);
10131 }
10132 else {
10133 n_res = 1;
10134 mapped[0] = c;
10135 }
10136 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010137 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010138 res[k++] = mapped[j];
10139 }
10140 }
10141 return k;
10142}
10143
10144static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010145do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010146 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010147{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010148 Py_ssize_t i, k = 0;
10149
10150 for (i = 0; i < length; i++) {
10151 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10152 int n_res, j;
10153 if (lower)
10154 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10155 else
10156 n_res = _PyUnicode_ToUpperFull(c, mapped);
10157 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010158 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010159 res[k++] = mapped[j];
10160 }
10161 }
10162 return k;
10163}
10164
10165static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010166do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010167{
10168 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10169}
10170
10171static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010172do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010173{
10174 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10175}
10176
Benjamin Petersone51757f2012-01-12 21:10:29 -050010177static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010178do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010179{
10180 Py_ssize_t i, k = 0;
10181
10182 for (i = 0; i < length; i++) {
10183 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10184 Py_UCS4 mapped[3];
10185 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10186 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010187 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010188 res[k++] = mapped[j];
10189 }
10190 }
10191 return k;
10192}
10193
10194static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010195do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010196{
10197 Py_ssize_t i, k = 0;
10198 int previous_is_cased;
10199
10200 previous_is_cased = 0;
10201 for (i = 0; i < length; i++) {
10202 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10203 Py_UCS4 mapped[3];
10204 int n_res, j;
10205
10206 if (previous_is_cased)
10207 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10208 else
10209 n_res = _PyUnicode_ToTitleFull(c, mapped);
10210
10211 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010212 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010213 res[k++] = mapped[j];
10214 }
10215
10216 previous_is_cased = _PyUnicode_IsCased(c);
10217 }
10218 return k;
10219}
10220
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010221static PyObject *
10222case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010223 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010224{
10225 PyObject *res = NULL;
10226 Py_ssize_t length, newlength = 0;
10227 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010228 const void *data;
10229 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010230 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10231
Benjamin Petersoneea48462012-01-16 14:28:50 -050010232 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010233
10234 kind = PyUnicode_KIND(self);
10235 data = PyUnicode_DATA(self);
10236 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010237 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010238 PyErr_SetString(PyExc_OverflowError, "string is too long");
10239 return NULL;
10240 }
Victor Stinner00d7abd2020-12-01 09:56:42 +010010241 tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010242 if (tmp == NULL)
10243 return PyErr_NoMemory();
10244 newlength = perform(kind, data, length, tmp, &maxchar);
10245 res = PyUnicode_New(newlength, maxchar);
10246 if (res == NULL)
10247 goto leave;
10248 tmpend = tmp + newlength;
10249 outdata = PyUnicode_DATA(res);
10250 outkind = PyUnicode_KIND(res);
10251 switch (outkind) {
10252 case PyUnicode_1BYTE_KIND:
10253 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10254 break;
10255 case PyUnicode_2BYTE_KIND:
10256 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10257 break;
10258 case PyUnicode_4BYTE_KIND:
10259 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10260 break;
10261 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010262 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010263 }
10264 leave:
Victor Stinner00d7abd2020-12-01 09:56:42 +010010265 PyMem_Free(tmp);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010266 return res;
10267}
10268
Tim Peters8ce9f162004-08-27 01:49:32 +000010269PyObject *
10270PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010271{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010272 PyObject *res;
10273 PyObject *fseq;
10274 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010275 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010276
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010277 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010278 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010279 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010280 }
10281
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010282 /* NOTE: the following code can't call back into Python code,
10283 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010284 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010285
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010286 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010287 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010288 res = _PyUnicode_JoinArray(separator, items, seqlen);
10289 Py_DECREF(fseq);
10290 return res;
10291}
10292
10293PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010294_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010295{
10296 PyObject *res = NULL; /* the result */
10297 PyObject *sep = NULL;
10298 Py_ssize_t seplen;
10299 PyObject *item;
10300 Py_ssize_t sz, i, res_offset;
10301 Py_UCS4 maxchar;
10302 Py_UCS4 item_maxchar;
10303 int use_memcpy;
10304 unsigned char *res_data = NULL, *sep_data = NULL;
10305 PyObject *last_obj;
10306 unsigned int kind = 0;
10307
Tim Peters05eba1f2004-08-27 21:32:02 +000010308 /* If empty sequence, return u"". */
10309 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010310 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010311 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010312
Tim Peters05eba1f2004-08-27 21:32:02 +000010313 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010314 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010315 if (seqlen == 1) {
10316 if (PyUnicode_CheckExact(items[0])) {
10317 res = items[0];
10318 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010319 return res;
10320 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010321 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010322 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010323 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010324 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010325 /* Set up sep and seplen */
10326 if (separator == NULL) {
10327 /* fall back to a blank space separator */
10328 sep = PyUnicode_FromOrdinal(' ');
10329 if (!sep)
10330 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010331 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010332 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010333 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010334 else {
10335 if (!PyUnicode_Check(separator)) {
10336 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010337 "separator: expected str instance,"
10338 " %.80s found",
10339 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010340 goto onError;
10341 }
10342 if (PyUnicode_READY(separator))
10343 goto onError;
10344 sep = separator;
10345 seplen = PyUnicode_GET_LENGTH(separator);
10346 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10347 /* inc refcount to keep this code path symmetric with the
10348 above case of a blank separator */
10349 Py_INCREF(sep);
10350 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010351 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010352 }
10353
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010354 /* There are at least two things to join, or else we have a subclass
10355 * of str in the sequence.
10356 * Do a pre-pass to figure out the total amount of space we'll
10357 * need (sz), and see whether all argument are strings.
10358 */
10359 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010360#ifdef Py_DEBUG
10361 use_memcpy = 0;
10362#else
10363 use_memcpy = 1;
10364#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010365 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010366 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010367 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010368 if (!PyUnicode_Check(item)) {
10369 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010370 "sequence item %zd: expected str instance,"
10371 " %.80s found",
10372 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010373 goto onError;
10374 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 if (PyUnicode_READY(item) == -1)
10376 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010377 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010379 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010380 if (i != 0) {
10381 add_sz += seplen;
10382 }
10383 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010384 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010385 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010386 goto onError;
10387 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010388 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010389 if (use_memcpy && last_obj != NULL) {
10390 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10391 use_memcpy = 0;
10392 }
10393 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010394 }
Tim Petersced69f82003-09-16 20:30:58 +000010395
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010397 if (res == NULL)
10398 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010399
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010400 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010401#ifdef Py_DEBUG
10402 use_memcpy = 0;
10403#else
10404 if (use_memcpy) {
10405 res_data = PyUnicode_1BYTE_DATA(res);
10406 kind = PyUnicode_KIND(res);
10407 if (seplen != 0)
10408 sep_data = PyUnicode_1BYTE_DATA(sep);
10409 }
10410#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010411 if (use_memcpy) {
10412 for (i = 0; i < seqlen; ++i) {
10413 Py_ssize_t itemlen;
10414 item = items[i];
10415
10416 /* Copy item, and maybe the separator. */
10417 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010418 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010419 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010420 kind * seplen);
10421 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010422 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010423
10424 itemlen = PyUnicode_GET_LENGTH(item);
10425 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010426 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010427 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010428 kind * itemlen);
10429 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010430 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010431 }
10432 assert(res_data == PyUnicode_1BYTE_DATA(res)
10433 + kind * PyUnicode_GET_LENGTH(res));
10434 }
10435 else {
10436 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10437 Py_ssize_t itemlen;
10438 item = items[i];
10439
10440 /* Copy item, and maybe the separator. */
10441 if (i && seplen != 0) {
10442 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10443 res_offset += seplen;
10444 }
10445
10446 itemlen = PyUnicode_GET_LENGTH(item);
10447 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010448 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010449 res_offset += itemlen;
10450 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010451 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010452 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010453 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010455 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010456 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010458
Benjamin Peterson29060642009-01-31 22:14:21 +000010459 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010460 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010461 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010462 return NULL;
10463}
10464
Victor Stinnerd3f08822012-05-29 12:57:52 +020010465void
10466_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10467 Py_UCS4 fill_char)
10468{
10469 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010470 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010471 assert(PyUnicode_IS_READY(unicode));
10472 assert(unicode_modifiable(unicode));
10473 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10474 assert(start >= 0);
10475 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010476 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010477}
10478
Victor Stinner3fe55312012-01-04 00:33:50 +010010479Py_ssize_t
10480PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10481 Py_UCS4 fill_char)
10482{
10483 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010484
10485 if (!PyUnicode_Check(unicode)) {
10486 PyErr_BadInternalCall();
10487 return -1;
10488 }
10489 if (PyUnicode_READY(unicode) == -1)
10490 return -1;
10491 if (unicode_check_modifiable(unicode))
10492 return -1;
10493
Victor Stinnerd3f08822012-05-29 12:57:52 +020010494 if (start < 0) {
10495 PyErr_SetString(PyExc_IndexError, "string index out of range");
10496 return -1;
10497 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010498 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10499 PyErr_SetString(PyExc_ValueError,
10500 "fill character is bigger than "
10501 "the string maximum character");
10502 return -1;
10503 }
10504
10505 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10506 length = Py_MIN(maxlen, length);
10507 if (length <= 0)
10508 return 0;
10509
Victor Stinnerd3f08822012-05-29 12:57:52 +020010510 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010511 return length;
10512}
10513
Victor Stinner9310abb2011-10-05 00:59:23 +020010514static PyObject *
10515pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010516 Py_ssize_t left,
10517 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010519{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 PyObject *u;
10521 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010522 int kind;
10523 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010524
10525 if (left < 0)
10526 left = 0;
10527 if (right < 0)
10528 right = 0;
10529
Victor Stinnerc4b49542011-12-11 22:44:26 +010010530 if (left == 0 && right == 0)
10531 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10534 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010535 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10536 return NULL;
10537 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010539 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010541 if (!u)
10542 return NULL;
10543
10544 kind = PyUnicode_KIND(u);
10545 data = PyUnicode_DATA(u);
10546 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010547 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010548 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010549 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010550 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010551 assert(_PyUnicode_CheckConsistency(u, 1));
10552 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010553}
10554
Alexander Belopolsky40018472011-02-26 01:02:56 +000010555PyObject *
10556PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010557{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010558 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010559
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010560 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010561 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010562
Benjamin Petersonead6b532011-12-20 17:23:42 -060010563 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010565 if (PyUnicode_IS_ASCII(string))
10566 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010567 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010568 PyUnicode_GET_LENGTH(string), keepends);
10569 else
10570 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010571 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010572 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 break;
10574 case PyUnicode_2BYTE_KIND:
10575 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010576 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010577 PyUnicode_GET_LENGTH(string), keepends);
10578 break;
10579 case PyUnicode_4BYTE_KIND:
10580 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010581 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 PyUnicode_GET_LENGTH(string), keepends);
10583 break;
10584 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010585 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010586 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010587 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010588}
10589
Alexander Belopolsky40018472011-02-26 01:02:56 +000010590static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010591split(PyObject *self,
10592 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010593 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010594{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010595 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010596 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 Py_ssize_t len1, len2;
10598 PyObject* out;
10599
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010601 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 if (PyUnicode_READY(self) == -1)
10604 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010605
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010607 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010609 if (PyUnicode_IS_ASCII(self))
10610 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010611 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010612 PyUnicode_GET_LENGTH(self), maxcount
10613 );
10614 else
10615 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010616 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010617 PyUnicode_GET_LENGTH(self), maxcount
10618 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 case PyUnicode_2BYTE_KIND:
10620 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010621 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 PyUnicode_GET_LENGTH(self), maxcount
10623 );
10624 case PyUnicode_4BYTE_KIND:
10625 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010626 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 PyUnicode_GET_LENGTH(self), maxcount
10628 );
10629 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010630 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 }
10632
10633 if (PyUnicode_READY(substring) == -1)
10634 return NULL;
10635
10636 kind1 = PyUnicode_KIND(self);
10637 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 len1 = PyUnicode_GET_LENGTH(self);
10639 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010640 if (kind1 < kind2 || len1 < len2) {
10641 out = PyList_New(1);
10642 if (out == NULL)
10643 return NULL;
10644 Py_INCREF(self);
10645 PyList_SET_ITEM(out, 0, self);
10646 return out;
10647 }
10648 buf1 = PyUnicode_DATA(self);
10649 buf2 = PyUnicode_DATA(substring);
10650 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010651 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010652 if (!buf2)
10653 return NULL;
10654 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010656 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010658 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10659 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010660 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010661 else
10662 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010663 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 break;
10665 case PyUnicode_2BYTE_KIND:
10666 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010667 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 break;
10669 case PyUnicode_4BYTE_KIND:
10670 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010671 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 break;
10673 default:
10674 out = NULL;
10675 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010676 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010677 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010678 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010679 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010680}
10681
Alexander Belopolsky40018472011-02-26 01:02:56 +000010682static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010683rsplit(PyObject *self,
10684 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010685 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010686{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010687 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010688 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010689 Py_ssize_t len1, len2;
10690 PyObject* out;
10691
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010692 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010693 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010694
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 if (PyUnicode_READY(self) == -1)
10696 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010697
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010699 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010701 if (PyUnicode_IS_ASCII(self))
10702 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010703 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010704 PyUnicode_GET_LENGTH(self), maxcount
10705 );
10706 else
10707 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010708 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010709 PyUnicode_GET_LENGTH(self), maxcount
10710 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010711 case PyUnicode_2BYTE_KIND:
10712 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010713 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 PyUnicode_GET_LENGTH(self), maxcount
10715 );
10716 case PyUnicode_4BYTE_KIND:
10717 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010718 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 PyUnicode_GET_LENGTH(self), maxcount
10720 );
10721 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010722 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 }
10724
10725 if (PyUnicode_READY(substring) == -1)
10726 return NULL;
10727
10728 kind1 = PyUnicode_KIND(self);
10729 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010730 len1 = PyUnicode_GET_LENGTH(self);
10731 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010732 if (kind1 < kind2 || len1 < len2) {
10733 out = PyList_New(1);
10734 if (out == NULL)
10735 return NULL;
10736 Py_INCREF(self);
10737 PyList_SET_ITEM(out, 0, self);
10738 return out;
10739 }
10740 buf1 = PyUnicode_DATA(self);
10741 buf2 = PyUnicode_DATA(substring);
10742 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010743 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010744 if (!buf2)
10745 return NULL;
10746 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010748 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010749 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010750 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10751 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010752 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010753 else
10754 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010755 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010756 break;
10757 case PyUnicode_2BYTE_KIND:
10758 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010759 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010760 break;
10761 case PyUnicode_4BYTE_KIND:
10762 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010763 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010764 break;
10765 default:
10766 out = NULL;
10767 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010768 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010769 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010770 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010771 return out;
10772}
10773
10774static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010775anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10776 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010778 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010780 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10781 return asciilib_find(buf1, len1, buf2, len2, offset);
10782 else
10783 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010784 case PyUnicode_2BYTE_KIND:
10785 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10786 case PyUnicode_4BYTE_KIND:
10787 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10788 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010789 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790}
10791
10792static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010793anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10794 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010796 switch (kind) {
10797 case PyUnicode_1BYTE_KIND:
10798 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10799 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10800 else
10801 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10802 case PyUnicode_2BYTE_KIND:
10803 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10804 case PyUnicode_4BYTE_KIND:
10805 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10806 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010807 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010808}
10809
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010810static void
10811replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10812 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10813{
10814 int kind = PyUnicode_KIND(u);
10815 void *data = PyUnicode_DATA(u);
10816 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10817 if (kind == PyUnicode_1BYTE_KIND) {
10818 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10819 (Py_UCS1 *)data + len,
10820 u1, u2, maxcount);
10821 }
10822 else if (kind == PyUnicode_2BYTE_KIND) {
10823 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10824 (Py_UCS2 *)data + len,
10825 u1, u2, maxcount);
10826 }
10827 else {
10828 assert(kind == PyUnicode_4BYTE_KIND);
10829 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10830 (Py_UCS4 *)data + len,
10831 u1, u2, maxcount);
10832 }
10833}
10834
Alexander Belopolsky40018472011-02-26 01:02:56 +000010835static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010836replace(PyObject *self, PyObject *str1,
10837 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010840 const char *sbuf = PyUnicode_DATA(self);
10841 const void *buf1 = PyUnicode_DATA(str1);
10842 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 int srelease = 0, release1 = 0, release2 = 0;
10844 int skind = PyUnicode_KIND(self);
10845 int kind1 = PyUnicode_KIND(str1);
10846 int kind2 = PyUnicode_KIND(str2);
10847 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10848 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10849 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010850 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010851 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010853 if (slen < len1)
10854 goto nothing;
10855
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010857 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010858 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010859 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010860
Victor Stinner59de0ee2011-10-07 10:01:28 +020010861 if (str1 == str2)
10862 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863
Victor Stinner49a0a212011-10-12 23:46:10 +020010864 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010865 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10866 if (maxchar < maxchar_str1)
10867 /* substring too wide to be present */
10868 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010869 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10870 /* Replacing str1 with str2 may cause a maxchar reduction in the
10871 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010872 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010873 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010876 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010877 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010878 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010879 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010880 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010881 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010882 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010883
Victor Stinner69ed0f42013-04-09 21:48:24 +020010884 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010885 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010886 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010887 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010888 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010889 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010890 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010891 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010892
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010893 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10894 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010895 }
10896 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010897 int rkind = skind;
10898 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010899 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010901 if (kind1 < rkind) {
10902 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010903 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010904 if (!buf1) goto error;
10905 release1 = 1;
10906 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010907 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010908 if (i < 0)
10909 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010910 if (rkind > kind2) {
10911 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010912 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010913 if (!buf2) goto error;
10914 release2 = 1;
10915 }
10916 else if (rkind < kind2) {
10917 /* widen self and buf1 */
10918 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010919 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010920 assert(buf1 != PyUnicode_DATA(str1));
10921 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010922 buf1 = PyUnicode_DATA(str1);
10923 release1 = 0;
10924 }
10925 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010926 if (!sbuf) goto error;
10927 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010928 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010929 if (!buf1) goto error;
10930 release1 = 1;
10931 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010932 u = PyUnicode_New(slen, maxchar);
10933 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010934 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010935 assert(PyUnicode_KIND(u) == rkind);
10936 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010937
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010938 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010939 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010940 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010941 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010942 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010943 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010944
10945 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010946 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010947 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010948 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010949 if (i == -1)
10950 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010951 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010952 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010953 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010955 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010956 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010957 }
10958 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010959 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010960 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010961 int rkind = skind;
10962 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010963
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010964 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010965 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010966 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010967 if (!buf1) goto error;
10968 release1 = 1;
10969 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010970 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010971 if (n == 0)
10972 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010973 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010974 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010975 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 if (!buf2) goto error;
10977 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010978 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010979 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010980 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010982 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010983 if (!sbuf) goto error;
10984 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010985 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010986 assert(buf1 != PyUnicode_DATA(str1));
10987 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010988 buf1 = PyUnicode_DATA(str1);
10989 release1 = 0;
10990 }
10991 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 if (!buf1) goto error;
10993 release1 = 1;
10994 }
10995 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10996 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010997 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 PyErr_SetString(PyExc_OverflowError,
10999 "replace string is too long");
11000 goto error;
11001 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010011002 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020011003 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020011004 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020011005 goto done;
11006 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080011007 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 PyErr_SetString(PyExc_OverflowError,
11009 "replace string is too long");
11010 goto error;
11011 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011012 u = PyUnicode_New(new_size, maxchar);
11013 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011014 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020011015 assert(PyUnicode_KIND(u) == rkind);
11016 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017 ires = i = 0;
11018 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011019 while (n-- > 0) {
11020 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020011021 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011022 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020011023 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000011024 if (j == -1)
11025 break;
11026 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011027 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011028 memcpy(res + rkind * ires,
11029 sbuf + rkind * i,
11030 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011031 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011032 }
11033 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011035 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011036 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011037 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011038 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011043 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011044 memcpy(res + rkind * ires,
11045 sbuf + rkind * i,
11046 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020011047 }
11048 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011049 /* interleave */
11050 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011051 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011053 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011054 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011055 if (--n <= 0)
11056 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011057 memcpy(res + rkind * ires,
11058 sbuf + rkind * i,
11059 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 ires++;
11061 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011062 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011063 memcpy(res + rkind * ires,
11064 sbuf + rkind * i,
11065 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011066 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011067 }
11068
11069 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020011070 unicode_adjust_maxchar(&u);
11071 if (u == NULL)
11072 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011073 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011074
11075 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011076 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11077 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11078 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011079 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011080 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011081 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011082 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011083 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011084 PyMem_Free((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011085 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011086 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011087
Benjamin Peterson29060642009-01-31 22:14:21 +000011088 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000011089 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011090 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11091 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11092 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011094 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011096 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011097 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011098 PyMem_Free((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010011099 return unicode_result_unchanged(self);
11100
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011101 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011102 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11103 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11104 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11105 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011106 PyMem_Free((void *)sbuf);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011107 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011108 PyMem_Free((void *)buf1);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011109 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011110 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011111 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011112}
11113
11114/* --- Unicode Object Methods --------------------------------------------- */
11115
INADA Naoki3ae20562017-01-16 20:41:20 +090011116/*[clinic input]
11117str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000011118
INADA Naoki3ae20562017-01-16 20:41:20 +090011119Return a version of the string where each word is titlecased.
11120
11121More specifically, words start with uppercased characters and all remaining
11122cased characters have lower case.
11123[clinic start generated code]*/
11124
11125static PyObject *
11126unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011127/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011128{
Benjamin Petersoneea48462012-01-16 14:28:50 -050011129 if (PyUnicode_READY(self) == -1)
11130 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011131 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011132}
11133
INADA Naoki3ae20562017-01-16 20:41:20 +090011134/*[clinic input]
11135str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000011136
INADA Naoki3ae20562017-01-16 20:41:20 +090011137Return a capitalized version of the string.
11138
11139More specifically, make the first character have upper case and the rest lower
11140case.
11141[clinic start generated code]*/
11142
11143static PyObject *
11144unicode_capitalize_impl(PyObject *self)
11145/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011146{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011147 if (PyUnicode_READY(self) == -1)
11148 return NULL;
11149 if (PyUnicode_GET_LENGTH(self) == 0)
11150 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011151 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152}
11153
INADA Naoki3ae20562017-01-16 20:41:20 +090011154/*[clinic input]
11155str.casefold as unicode_casefold
11156
11157Return a version of the string suitable for caseless comparisons.
11158[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011159
11160static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011161unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011162/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011163{
11164 if (PyUnicode_READY(self) == -1)
11165 return NULL;
11166 if (PyUnicode_IS_ASCII(self))
11167 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011168 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011169}
11170
11171
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011172/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011173
11174static int
11175convert_uc(PyObject *obj, void *addr)
11176{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011177 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011178
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011179 if (!PyUnicode_Check(obj)) {
11180 PyErr_Format(PyExc_TypeError,
11181 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011182 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011183 return 0;
11184 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011185 if (PyUnicode_READY(obj) < 0)
11186 return 0;
11187 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011188 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011189 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011190 return 0;
11191 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011192 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011193 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011194}
11195
INADA Naoki3ae20562017-01-16 20:41:20 +090011196/*[clinic input]
11197str.center as unicode_center
11198
11199 width: Py_ssize_t
11200 fillchar: Py_UCS4 = ' '
11201 /
11202
11203Return a centered string of length width.
11204
11205Padding is done using the specified fill character (default is a space).
11206[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207
11208static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011209unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11210/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011212 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011213
Benjamin Petersonbac79492012-01-14 13:34:47 -050011214 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215 return NULL;
11216
Victor Stinnerc4b49542011-12-11 22:44:26 +010011217 if (PyUnicode_GET_LENGTH(self) >= width)
11218 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011219
Victor Stinnerc4b49542011-12-11 22:44:26 +010011220 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011221 left = marg / 2 + (marg & width & 1);
11222
Victor Stinner9310abb2011-10-05 00:59:23 +020011223 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011224}
11225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011226/* This function assumes that str1 and str2 are readied by the caller. */
11227
Marc-André Lemburge5034372000-08-08 08:04:29 +000011228static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011229unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011230{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011231#define COMPARE(TYPE1, TYPE2) \
11232 do { \
11233 TYPE1* p1 = (TYPE1 *)data1; \
11234 TYPE2* p2 = (TYPE2 *)data2; \
11235 TYPE1* end = p1 + len; \
11236 Py_UCS4 c1, c2; \
11237 for (; p1 != end; p1++, p2++) { \
11238 c1 = *p1; \
11239 c2 = *p2; \
11240 if (c1 != c2) \
11241 return (c1 < c2) ? -1 : 1; \
11242 } \
11243 } \
11244 while (0)
11245
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011247 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011248 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011249
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 kind1 = PyUnicode_KIND(str1);
11251 kind2 = PyUnicode_KIND(str2);
11252 data1 = PyUnicode_DATA(str1);
11253 data2 = PyUnicode_DATA(str2);
11254 len1 = PyUnicode_GET_LENGTH(str1);
11255 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011256 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011257
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011258 switch(kind1) {
11259 case PyUnicode_1BYTE_KIND:
11260 {
11261 switch(kind2) {
11262 case PyUnicode_1BYTE_KIND:
11263 {
11264 int cmp = memcmp(data1, data2, len);
11265 /* normalize result of memcmp() into the range [-1; 1] */
11266 if (cmp < 0)
11267 return -1;
11268 if (cmp > 0)
11269 return 1;
11270 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011271 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011272 case PyUnicode_2BYTE_KIND:
11273 COMPARE(Py_UCS1, Py_UCS2);
11274 break;
11275 case PyUnicode_4BYTE_KIND:
11276 COMPARE(Py_UCS1, Py_UCS4);
11277 break;
11278 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011279 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011280 }
11281 break;
11282 }
11283 case PyUnicode_2BYTE_KIND:
11284 {
11285 switch(kind2) {
11286 case PyUnicode_1BYTE_KIND:
11287 COMPARE(Py_UCS2, Py_UCS1);
11288 break;
11289 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011290 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011291 COMPARE(Py_UCS2, Py_UCS2);
11292 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011293 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011294 case PyUnicode_4BYTE_KIND:
11295 COMPARE(Py_UCS2, Py_UCS4);
11296 break;
11297 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011298 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011299 }
11300 break;
11301 }
11302 case PyUnicode_4BYTE_KIND:
11303 {
11304 switch(kind2) {
11305 case PyUnicode_1BYTE_KIND:
11306 COMPARE(Py_UCS4, Py_UCS1);
11307 break;
11308 case PyUnicode_2BYTE_KIND:
11309 COMPARE(Py_UCS4, Py_UCS2);
11310 break;
11311 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011312 {
11313#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11314 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11315 /* normalize result of wmemcmp() into the range [-1; 1] */
11316 if (cmp < 0)
11317 return -1;
11318 if (cmp > 0)
11319 return 1;
11320#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011321 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011322#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011323 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011324 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011325 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011326 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011327 }
11328 break;
11329 }
11330 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011331 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011332 }
11333
Victor Stinner770e19e2012-10-04 22:59:45 +020011334 if (len1 == len2)
11335 return 0;
11336 if (len1 < len2)
11337 return -1;
11338 else
11339 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011340
11341#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011342}
11343
Benjamin Peterson621b4302016-09-09 13:54:34 -070011344static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011345unicode_compare_eq(PyObject *str1, PyObject *str2)
11346{
11347 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011348 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011349 Py_ssize_t len;
11350 int cmp;
11351
Victor Stinnere5567ad2012-10-23 02:48:49 +020011352 len = PyUnicode_GET_LENGTH(str1);
11353 if (PyUnicode_GET_LENGTH(str2) != len)
11354 return 0;
11355 kind = PyUnicode_KIND(str1);
11356 if (PyUnicode_KIND(str2) != kind)
11357 return 0;
11358 data1 = PyUnicode_DATA(str1);
11359 data2 = PyUnicode_DATA(str2);
11360
11361 cmp = memcmp(data1, data2, len * kind);
11362 return (cmp == 0);
11363}
11364
11365
Alexander Belopolsky40018472011-02-26 01:02:56 +000011366int
11367PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011368{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011369 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11370 if (PyUnicode_READY(left) == -1 ||
11371 PyUnicode_READY(right) == -1)
11372 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011373
11374 /* a string is equal to itself */
11375 if (left == right)
11376 return 0;
11377
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011378 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011379 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011380 PyErr_Format(PyExc_TypeError,
11381 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011382 Py_TYPE(left)->tp_name,
11383 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011384 return -1;
11385}
11386
Martin v. Löwis5b222132007-06-10 09:51:05 +000011387int
11388PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11389{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011390 Py_ssize_t i;
11391 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011392 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011393 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011394
Victor Stinner910337b2011-10-03 03:20:16 +020011395 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011396 if (!PyUnicode_IS_READY(uni)) {
11397 const wchar_t *ws = _PyUnicode_WSTR(uni);
11398 /* Compare Unicode string and source character set string */
11399 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11400 if (chr != ustr[i])
11401 return (chr < ustr[i]) ? -1 : 1;
11402 }
11403 /* This check keeps Python strings that end in '\0' from comparing equal
11404 to C strings identical up to that point. */
11405 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11406 return 1; /* uni is longer */
11407 if (ustr[i])
11408 return -1; /* str is longer */
11409 return 0;
11410 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011411 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011412 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011413 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011414 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011415 size_t len, len2 = strlen(str);
11416 int cmp;
11417
11418 len = Py_MIN(len1, len2);
11419 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011420 if (cmp != 0) {
11421 if (cmp < 0)
11422 return -1;
11423 else
11424 return 1;
11425 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011426 if (len1 > len2)
11427 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011428 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011429 return -1; /* str is longer */
11430 return 0;
11431 }
11432 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011433 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011434 /* Compare Unicode string and source character set string */
11435 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011436 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011437 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11438 /* This check keeps Python strings that end in '\0' from comparing equal
11439 to C strings identical up to that point. */
11440 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11441 return 1; /* uni is longer */
11442 if (str[i])
11443 return -1; /* str is longer */
11444 return 0;
11445 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011446}
11447
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011448static int
11449non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11450{
11451 size_t i, len;
11452 const wchar_t *p;
11453 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11454 if (strlen(str) != len)
11455 return 0;
11456 p = _PyUnicode_WSTR(unicode);
11457 assert(p);
11458 for (i = 0; i < len; i++) {
11459 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011460 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011461 return 0;
11462 }
11463 return 1;
11464}
11465
11466int
11467_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11468{
11469 size_t len;
11470 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011471 assert(str);
11472#ifndef NDEBUG
11473 for (const char *p = str; *p; p++) {
11474 assert((unsigned char)*p < 128);
11475 }
11476#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011477 if (PyUnicode_READY(unicode) == -1) {
11478 /* Memory error or bad data */
11479 PyErr_Clear();
11480 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11481 }
11482 if (!PyUnicode_IS_ASCII(unicode))
11483 return 0;
11484 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11485 return strlen(str) == len &&
11486 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11487}
11488
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011489int
11490_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11491{
11492 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011493
11494 assert(_PyUnicode_CHECK(left));
11495 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011496#ifndef NDEBUG
11497 for (const char *p = right->string; *p; p++) {
11498 assert((unsigned char)*p < 128);
11499 }
11500#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011501
11502 if (PyUnicode_READY(left) == -1) {
11503 /* memory error or bad data */
11504 PyErr_Clear();
11505 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11506 }
11507
11508 if (!PyUnicode_IS_ASCII(left))
11509 return 0;
11510
11511 right_uni = _PyUnicode_FromId(right); /* borrowed */
11512 if (right_uni == NULL) {
11513 /* memory error or bad data */
11514 PyErr_Clear();
11515 return _PyUnicode_EqualToASCIIString(left, right->string);
11516 }
11517
11518 if (left == right_uni)
11519 return 1;
11520
11521 if (PyUnicode_CHECK_INTERNED(left))
11522 return 0;
11523
INADA Naoki7cc95f52018-01-28 02:07:09 +090011524 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011525 Py_hash_t hash = _PyUnicode_HASH(left);
Victor Stinnerea251802020-12-26 02:58:33 +010011526 if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011527 return 0;
Victor Stinnerea251802020-12-26 02:58:33 +010011528 }
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011529
11530 return unicode_compare_eq(left, right_uni);
11531}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011532
Alexander Belopolsky40018472011-02-26 01:02:56 +000011533PyObject *
11534PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011535{
11536 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011537
Victor Stinnere5567ad2012-10-23 02:48:49 +020011538 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11539 Py_RETURN_NOTIMPLEMENTED;
11540
11541 if (PyUnicode_READY(left) == -1 ||
11542 PyUnicode_READY(right) == -1)
11543 return NULL;
11544
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011545 if (left == right) {
11546 switch (op) {
11547 case Py_EQ:
11548 case Py_LE:
11549 case Py_GE:
11550 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011551 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011552 case Py_NE:
11553 case Py_LT:
11554 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011555 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011556 default:
11557 PyErr_BadArgument();
11558 return NULL;
11559 }
11560 }
11561 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011562 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011563 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011564 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011565 }
11566 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011567 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011568 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011569 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011570}
11571
Alexander Belopolsky40018472011-02-26 01:02:56 +000011572int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011573_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11574{
11575 return unicode_eq(aa, bb);
11576}
11577
11578int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011579PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011580{
Victor Stinner77282cb2013-04-14 19:22:47 +020011581 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011582 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011583 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011584 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011585
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011586 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011587 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011588 "'in <string>' requires string as left operand, not %.100s",
11589 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011590 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011591 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011592 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011593 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011594 if (ensure_unicode(str) < 0)
11595 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011597 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011598 kind2 = PyUnicode_KIND(substr);
11599 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011600 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011601 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011602 len2 = PyUnicode_GET_LENGTH(substr);
11603 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011604 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011605 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011606 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011607 if (len2 == 1) {
11608 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11609 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011610 return result;
11611 }
11612 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011613 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011614 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011615 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011617
Victor Stinner77282cb2013-04-14 19:22:47 +020011618 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011619 case PyUnicode_1BYTE_KIND:
11620 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11621 break;
11622 case PyUnicode_2BYTE_KIND:
11623 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11624 break;
11625 case PyUnicode_4BYTE_KIND:
11626 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11627 break;
11628 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011629 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011631
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011632 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011633 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011634 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635
Guido van Rossum403d68b2000-03-13 15:55:09 +000011636 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011637}
11638
Guido van Rossumd57fd912000-03-10 22:53:23 +000011639/* Concat to string or Unicode object giving a new Unicode object. */
11640
Alexander Belopolsky40018472011-02-26 01:02:56 +000011641PyObject *
11642PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011643{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011644 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011645 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011646 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011647
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011648 if (ensure_unicode(left) < 0)
11649 return NULL;
11650
11651 if (!PyUnicode_Check(right)) {
11652 PyErr_Format(PyExc_TypeError,
11653 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011654 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011655 return NULL;
11656 }
11657 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659
11660 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011661 PyObject *empty = unicode_get_empty(); // Borrowed reference
11662 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011663 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011664 }
11665 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011666 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011668
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011669 left_len = PyUnicode_GET_LENGTH(left);
11670 right_len = PyUnicode_GET_LENGTH(right);
11671 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011672 PyErr_SetString(PyExc_OverflowError,
11673 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011674 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011675 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011676 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011677
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011678 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11679 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011680 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011681
Guido van Rossumd57fd912000-03-10 22:53:23 +000011682 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011683 result = PyUnicode_New(new_len, maxchar);
11684 if (result == NULL)
11685 return NULL;
11686 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11687 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11688 assert(_PyUnicode_CheckConsistency(result, 1));
11689 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690}
11691
Walter Dörwald1ab83302007-05-18 17:15:44 +000011692void
Victor Stinner23e56682011-10-03 03:54:37 +020011693PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011694{
Victor Stinner23e56682011-10-03 03:54:37 +020011695 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011696 Py_UCS4 maxchar, maxchar2;
11697 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011698
11699 if (p_left == NULL) {
11700 if (!PyErr_Occurred())
11701 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011702 return;
11703 }
Victor Stinner23e56682011-10-03 03:54:37 +020011704 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011705 if (right == NULL || left == NULL
11706 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011707 if (!PyErr_Occurred())
11708 PyErr_BadInternalCall();
11709 goto error;
11710 }
11711
Benjamin Petersonbac79492012-01-14 13:34:47 -050011712 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011713 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011714 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011715 goto error;
11716
Victor Stinner488fa492011-12-12 00:01:39 +010011717 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011718 PyObject *empty = unicode_get_empty(); // Borrowed reference
11719 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011720 Py_DECREF(left);
11721 Py_INCREF(right);
11722 *p_left = right;
11723 return;
11724 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011725 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011726 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011727 }
Victor Stinner488fa492011-12-12 00:01:39 +010011728
11729 left_len = PyUnicode_GET_LENGTH(left);
11730 right_len = PyUnicode_GET_LENGTH(right);
11731 if (left_len > PY_SSIZE_T_MAX - right_len) {
11732 PyErr_SetString(PyExc_OverflowError,
11733 "strings are too large to concat");
11734 goto error;
11735 }
11736 new_len = left_len + right_len;
11737
11738 if (unicode_modifiable(left)
11739 && PyUnicode_CheckExact(right)
11740 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011741 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11742 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011743 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011744 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011745 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11746 {
11747 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011748 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011749 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011750
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011751 /* copy 'right' into the newly allocated area of 'left' */
11752 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011753 }
Victor Stinner488fa492011-12-12 00:01:39 +010011754 else {
11755 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11756 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011757 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011758
Victor Stinner488fa492011-12-12 00:01:39 +010011759 /* Concat the two Unicode strings */
11760 res = PyUnicode_New(new_len, maxchar);
11761 if (res == NULL)
11762 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011763 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11764 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011765 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011766 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011767 }
11768 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011769 return;
11770
11771error:
Victor Stinner488fa492011-12-12 00:01:39 +010011772 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011773}
11774
11775void
11776PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11777{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011778 PyUnicode_Append(pleft, right);
11779 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011780}
11781
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011782/*
11783Wraps stringlib_parse_args_finds() and additionally ensures that the
11784first argument is a unicode object.
11785*/
11786
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011787static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011788parse_args_finds_unicode(const char * function_name, PyObject *args,
11789 PyObject **substring,
11790 Py_ssize_t *start, Py_ssize_t *end)
11791{
11792 if(stringlib_parse_args_finds(function_name, args, substring,
11793 start, end)) {
11794 if (ensure_unicode(*substring) < 0)
11795 return 0;
11796 return 1;
11797 }
11798 return 0;
11799}
11800
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011801PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011802 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011804Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011805string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011806interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011807
11808static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011809unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011811 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011812 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011813 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011815 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011816 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011819 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011820 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011822 kind1 = PyUnicode_KIND(self);
11823 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011824 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011825 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011826
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011827 len1 = PyUnicode_GET_LENGTH(self);
11828 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011830 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011831 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011832
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011833 buf1 = PyUnicode_DATA(self);
11834 buf2 = PyUnicode_DATA(substring);
11835 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011836 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011837 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011838 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011839 }
11840 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011841 case PyUnicode_1BYTE_KIND:
11842 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011843 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011844 buf2, len2, PY_SSIZE_T_MAX
11845 );
11846 break;
11847 case PyUnicode_2BYTE_KIND:
11848 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011849 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 buf2, len2, PY_SSIZE_T_MAX
11851 );
11852 break;
11853 case PyUnicode_4BYTE_KIND:
11854 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011855 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011856 buf2, len2, PY_SSIZE_T_MAX
11857 );
11858 break;
11859 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011860 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 }
11862
11863 result = PyLong_FromSsize_t(iresult);
11864
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011865 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011866 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011867 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011868
Guido van Rossumd57fd912000-03-10 22:53:23 +000011869 return result;
11870}
11871
INADA Naoki3ae20562017-01-16 20:41:20 +090011872/*[clinic input]
11873str.encode as unicode_encode
11874
11875 encoding: str(c_default="NULL") = 'utf-8'
11876 The encoding in which to encode the string.
11877 errors: str(c_default="NULL") = 'strict'
11878 The error handling scheme to use for encoding errors.
11879 The default is 'strict' meaning that encoding errors raise a
11880 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11881 'xmlcharrefreplace' as well as any other name registered with
11882 codecs.register_error that can handle UnicodeEncodeErrors.
11883
11884Encode the string using the codec registered for encoding.
11885[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886
11887static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011888unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011889/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011890{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011891 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011892}
11893
INADA Naoki3ae20562017-01-16 20:41:20 +090011894/*[clinic input]
11895str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896
INADA Naoki3ae20562017-01-16 20:41:20 +090011897 tabsize: int = 8
11898
11899Return a copy where all tab characters are expanded using spaces.
11900
11901If tabsize is not given, a tab size of 8 characters is assumed.
11902[clinic start generated code]*/
11903
11904static PyObject *
11905unicode_expandtabs_impl(PyObject *self, int tabsize)
11906/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011908 Py_ssize_t i, j, line_pos, src_len, incr;
11909 Py_UCS4 ch;
11910 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011911 const void *src_data;
11912 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011913 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011914 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915
Antoine Pitrou22425222011-10-04 19:10:51 +020011916 if (PyUnicode_READY(self) == -1)
11917 return NULL;
11918
Thomas Wouters7e474022000-07-16 12:04:32 +000011919 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011920 src_len = PyUnicode_GET_LENGTH(self);
11921 i = j = line_pos = 0;
11922 kind = PyUnicode_KIND(self);
11923 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011924 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011925 for (; i < src_len; i++) {
11926 ch = PyUnicode_READ(kind, src_data, i);
11927 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011928 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011929 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011930 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011931 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011932 goto overflow;
11933 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011934 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011935 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011936 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011937 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011938 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011939 goto overflow;
11940 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011942 if (ch == '\n' || ch == '\r')
11943 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011945 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011946 if (!found)
11947 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011948
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011950 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951 if (!u)
11952 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011953 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011954
Antoine Pitroue71d5742011-10-04 15:55:09 +020011955 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011956
Antoine Pitroue71d5742011-10-04 15:55:09 +020011957 for (; i < src_len; i++) {
11958 ch = PyUnicode_READ(kind, src_data, i);
11959 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011960 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011961 incr = tabsize - (line_pos % tabsize);
11962 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011963 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011964 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011965 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011966 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011967 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011968 line_pos++;
11969 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011970 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011971 if (ch == '\n' || ch == '\r')
11972 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011973 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011974 }
11975 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011976 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011977
Antoine Pitroue71d5742011-10-04 15:55:09 +020011978 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011979 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11980 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011981}
11982
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011983PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011984 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011985\n\
11986Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011987such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988arguments start and end are interpreted as in slice notation.\n\
11989\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011990Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991
11992static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011995 /* initialize variables to prevent gcc warning */
11996 PyObject *substring = NULL;
11997 Py_ssize_t start = 0;
11998 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011999 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012000
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012001 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012003
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012004 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012005 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012007 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012009 if (result == -2)
12010 return NULL;
12011
Christian Heimes217cfd12007-12-02 14:31:20 +000012012 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013}
12014
12015static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012016unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012017{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012018 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012019 enum PyUnicode_Kind kind;
12020 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012021
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012022 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012023 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012025 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012026 if (PyUnicode_READY(self) == -1) {
12027 return NULL;
12028 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012029 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
12030 PyErr_SetString(PyExc_IndexError, "string index out of range");
12031 return NULL;
12032 }
12033 kind = PyUnicode_KIND(self);
12034 data = PyUnicode_DATA(self);
12035 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010012036 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037}
12038
Guido van Rossumc2504932007-09-18 19:42:40 +000012039/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010012040 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000012041static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012042unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080012044 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000012045
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012046#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050012047 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012048#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012049 if (_PyUnicode_HASH(self) != -1)
12050 return _PyUnicode_HASH(self);
12051 if (PyUnicode_READY(self) == -1)
12052 return -1;
animalizea1d14252019-01-02 20:16:06 +080012053
Christian Heimes985ecdc2013-11-20 11:46:18 +010012054 x = _Py_HashBytes(PyUnicode_DATA(self),
12055 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000012057 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012058}
12059
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012060PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012061 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062\n\
oldkaa0735f2018-02-02 16:52:55 +080012063Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012064such that sub is contained within S[start:end]. Optional\n\
12065arguments start and end are interpreted as in slice notation.\n\
12066\n\
12067Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068
12069static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012070unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012072 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000012073 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012074 PyObject *substring = NULL;
12075 Py_ssize_t start = 0;
12076 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012078 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012080
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012081 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012084 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012085
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086 if (result == -2)
12087 return NULL;
12088
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089 if (result < 0) {
12090 PyErr_SetString(PyExc_ValueError, "substring not found");
12091 return NULL;
12092 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012093
Christian Heimes217cfd12007-12-02 14:31:20 +000012094 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095}
12096
INADA Naoki3ae20562017-01-16 20:41:20 +090012097/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090012098str.isascii as unicode_isascii
12099
12100Return True if all characters in the string are ASCII, False otherwise.
12101
12102ASCII characters have code points in the range U+0000-U+007F.
12103Empty string is ASCII too.
12104[clinic start generated code]*/
12105
12106static PyObject *
12107unicode_isascii_impl(PyObject *self)
12108/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12109{
12110 if (PyUnicode_READY(self) == -1) {
12111 return NULL;
12112 }
12113 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12114}
12115
12116/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090012117str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118
INADA Naoki3ae20562017-01-16 20:41:20 +090012119Return True if the string is a lowercase string, False otherwise.
12120
12121A string is lowercase if all cased characters in the string are lowercase and
12122there is at least one cased character in the string.
12123[clinic start generated code]*/
12124
12125static PyObject *
12126unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012127/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 Py_ssize_t i, length;
12130 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012131 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012132 int cased;
12133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 if (PyUnicode_READY(self) == -1)
12135 return NULL;
12136 length = PyUnicode_GET_LENGTH(self);
12137 kind = PyUnicode_KIND(self);
12138 data = PyUnicode_DATA(self);
12139
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 if (length == 1)
12142 return PyBool_FromLong(
12143 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012145 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012146 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012147 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012148
Guido van Rossumd57fd912000-03-10 22:53:23 +000012149 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 for (i = 0; i < length; i++) {
12151 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012152
Benjamin Peterson29060642009-01-31 22:14:21 +000012153 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012154 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012155 else if (!cased && Py_UNICODE_ISLOWER(ch))
12156 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012157 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012158 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159}
12160
INADA Naoki3ae20562017-01-16 20:41:20 +090012161/*[clinic input]
12162str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012163
INADA Naoki3ae20562017-01-16 20:41:20 +090012164Return True if the string is an uppercase string, False otherwise.
12165
12166A string is uppercase if all cased characters in the string are uppercase and
12167there is at least one cased character in the string.
12168[clinic start generated code]*/
12169
12170static PyObject *
12171unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012172/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012174 Py_ssize_t i, length;
12175 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012176 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012177 int cased;
12178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012179 if (PyUnicode_READY(self) == -1)
12180 return NULL;
12181 length = PyUnicode_GET_LENGTH(self);
12182 kind = PyUnicode_KIND(self);
12183 data = PyUnicode_DATA(self);
12184
Guido van Rossumd57fd912000-03-10 22:53:23 +000012185 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012186 if (length == 1)
12187 return PyBool_FromLong(
12188 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012189
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012190 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012191 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012192 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012193
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 for (i = 0; i < length; i++) {
12196 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012197
Benjamin Peterson29060642009-01-31 22:14:21 +000012198 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012199 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012200 else if (!cased && Py_UNICODE_ISUPPER(ch))
12201 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012202 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012203 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012204}
12205
INADA Naoki3ae20562017-01-16 20:41:20 +090012206/*[clinic input]
12207str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012208
INADA Naoki3ae20562017-01-16 20:41:20 +090012209Return True if the string is a title-cased string, False otherwise.
12210
12211In a title-cased string, upper- and title-case characters may only
12212follow uncased characters and lowercase characters only cased ones.
12213[clinic start generated code]*/
12214
12215static PyObject *
12216unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012217/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012218{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 Py_ssize_t i, length;
12220 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012221 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012222 int cased, previous_is_cased;
12223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 if (PyUnicode_READY(self) == -1)
12225 return NULL;
12226 length = PyUnicode_GET_LENGTH(self);
12227 kind = PyUnicode_KIND(self);
12228 data = PyUnicode_DATA(self);
12229
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 if (length == 1) {
12232 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12233 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12234 (Py_UNICODE_ISUPPER(ch) != 0));
12235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012237 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012239 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012240
Guido van Rossumd57fd912000-03-10 22:53:23 +000012241 cased = 0;
12242 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 for (i = 0; i < length; i++) {
12244 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012245
Benjamin Peterson29060642009-01-31 22:14:21 +000012246 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12247 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012248 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012249 previous_is_cased = 1;
12250 cased = 1;
12251 }
12252 else if (Py_UNICODE_ISLOWER(ch)) {
12253 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012254 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012255 previous_is_cased = 1;
12256 cased = 1;
12257 }
12258 else
12259 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012261 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262}
12263
INADA Naoki3ae20562017-01-16 20:41:20 +090012264/*[clinic input]
12265str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012266
INADA Naoki3ae20562017-01-16 20:41:20 +090012267Return True if the string is a whitespace string, False otherwise.
12268
12269A string is whitespace if all characters in the string are whitespace and there
12270is at least one character in the string.
12271[clinic start generated code]*/
12272
12273static PyObject *
12274unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012275/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277 Py_ssize_t i, length;
12278 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012279 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012280
12281 if (PyUnicode_READY(self) == -1)
12282 return NULL;
12283 length = PyUnicode_GET_LENGTH(self);
12284 kind = PyUnicode_KIND(self);
12285 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012286
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 if (length == 1)
12289 return PyBool_FromLong(
12290 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012291
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012292 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012293 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012294 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012296 for (i = 0; i < length; i++) {
12297 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012298 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012299 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012300 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012301 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302}
12303
INADA Naoki3ae20562017-01-16 20:41:20 +090012304/*[clinic input]
12305str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012306
INADA Naoki3ae20562017-01-16 20:41:20 +090012307Return True if the string is an alphabetic string, False otherwise.
12308
12309A string is alphabetic if all characters in the string are alphabetic and there
12310is at least one character in the string.
12311[clinic start generated code]*/
12312
12313static PyObject *
12314unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012315/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012316{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 Py_ssize_t i, length;
12318 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012319 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320
12321 if (PyUnicode_READY(self) == -1)
12322 return NULL;
12323 length = PyUnicode_GET_LENGTH(self);
12324 kind = PyUnicode_KIND(self);
12325 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012326
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012327 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012328 if (length == 1)
12329 return PyBool_FromLong(
12330 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012331
12332 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012333 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012334 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336 for (i = 0; i < length; i++) {
12337 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012338 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012339 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012340 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012341}
12342
INADA Naoki3ae20562017-01-16 20:41:20 +090012343/*[clinic input]
12344str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012345
INADA Naoki3ae20562017-01-16 20:41:20 +090012346Return True if the string is an alpha-numeric string, False otherwise.
12347
12348A string is alpha-numeric if all characters in the string are alpha-numeric and
12349there is at least one character in the string.
12350[clinic start generated code]*/
12351
12352static PyObject *
12353unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012354/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012357 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358 Py_ssize_t len, i;
12359
12360 if (PyUnicode_READY(self) == -1)
12361 return NULL;
12362
12363 kind = PyUnicode_KIND(self);
12364 data = PyUnicode_DATA(self);
12365 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012366
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012367 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012368 if (len == 1) {
12369 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12370 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12371 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012372
12373 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012374 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012375 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012376
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 for (i = 0; i < len; i++) {
12378 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012379 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012380 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012381 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012382 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012383}
12384
INADA Naoki3ae20562017-01-16 20:41:20 +090012385/*[clinic input]
12386str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012387
INADA Naoki3ae20562017-01-16 20:41:20 +090012388Return True if the string is a decimal string, False otherwise.
12389
12390A string is a decimal string if all characters in the string are decimal and
12391there is at least one character in the string.
12392[clinic start generated code]*/
12393
12394static PyObject *
12395unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012396/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012398 Py_ssize_t i, length;
12399 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012400 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012401
12402 if (PyUnicode_READY(self) == -1)
12403 return NULL;
12404 length = PyUnicode_GET_LENGTH(self);
12405 kind = PyUnicode_KIND(self);
12406 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012407
Guido van Rossumd57fd912000-03-10 22:53:23 +000012408 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012409 if (length == 1)
12410 return PyBool_FromLong(
12411 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012412
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012413 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012414 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012415 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012416
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012417 for (i = 0; i < length; i++) {
12418 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012419 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012420 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012421 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422}
12423
INADA Naoki3ae20562017-01-16 20:41:20 +090012424/*[clinic input]
12425str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012426
INADA Naoki3ae20562017-01-16 20:41:20 +090012427Return True if the string is a digit string, False otherwise.
12428
12429A string is a digit string if all characters in the string are digits and there
12430is at least one character in the string.
12431[clinic start generated code]*/
12432
12433static PyObject *
12434unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012435/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012436{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 Py_ssize_t i, length;
12438 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012439 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012440
12441 if (PyUnicode_READY(self) == -1)
12442 return NULL;
12443 length = PyUnicode_GET_LENGTH(self);
12444 kind = PyUnicode_KIND(self);
12445 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012446
Guido van Rossumd57fd912000-03-10 22:53:23 +000012447 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012448 if (length == 1) {
12449 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12450 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012453 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012454 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012455 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012456
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012457 for (i = 0; i < length; i++) {
12458 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012459 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012460 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012461 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012462}
12463
INADA Naoki3ae20562017-01-16 20:41:20 +090012464/*[clinic input]
12465str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466
INADA Naoki3ae20562017-01-16 20:41:20 +090012467Return True if the string is a numeric string, False otherwise.
12468
12469A string is numeric if all characters in the string are numeric and there is at
12470least one character in the string.
12471[clinic start generated code]*/
12472
12473static PyObject *
12474unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012475/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 Py_ssize_t i, length;
12478 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012479 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480
12481 if (PyUnicode_READY(self) == -1)
12482 return NULL;
12483 length = PyUnicode_GET_LENGTH(self);
12484 kind = PyUnicode_KIND(self);
12485 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486
Guido van Rossumd57fd912000-03-10 22:53:23 +000012487 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488 if (length == 1)
12489 return PyBool_FromLong(
12490 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012491
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012492 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012493 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012494 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 for (i = 0; i < length; i++) {
12497 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012498 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012500 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501}
12502
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012503Py_ssize_t
12504_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012505{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012506 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012507 if (PyUnicode_READY(self) == -1)
12508 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012509
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012510 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012511 if (len == 0) {
12512 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012513 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012514 }
12515
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012516 int kind = PyUnicode_KIND(self);
12517 const void *data = PyUnicode_DATA(self);
12518 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012519 /* PEP 3131 says that the first character must be in
12520 XID_Start and subsequent characters in XID_Continue,
12521 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012522 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012523 letters, digits, underscore). However, given the current
12524 definition of XID_Start and XID_Continue, it is sufficient
12525 to check just for these, except that _ must be allowed
12526 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012527 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012528 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012529 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012530
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012531 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012532 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012533 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012534 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012535 }
12536 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012537 return i;
12538}
12539
12540int
12541PyUnicode_IsIdentifier(PyObject *self)
12542{
12543 if (PyUnicode_IS_READY(self)) {
12544 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12545 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12546 /* an empty string is not a valid identifier */
12547 return len && i == len;
12548 }
12549 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012550_Py_COMP_DIAG_PUSH
12551_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012552 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012553 if (len == 0) {
12554 /* an empty string is not a valid identifier */
12555 return 0;
12556 }
12557
12558 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012559 Py_UCS4 ch = wstr[i++];
12560#if SIZEOF_WCHAR_T == 2
12561 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12562 && i < len
12563 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12564 {
12565 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12566 i++;
12567 }
12568#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012569 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12570 return 0;
12571 }
12572
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012573 while (i < len) {
12574 ch = wstr[i++];
12575#if SIZEOF_WCHAR_T == 2
12576 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12577 && i < len
12578 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12579 {
12580 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12581 i++;
12582 }
12583#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012584 if (!_PyUnicode_IsXidContinue(ch)) {
12585 return 0;
12586 }
12587 }
12588 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012589_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012590 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012591}
12592
INADA Naoki3ae20562017-01-16 20:41:20 +090012593/*[clinic input]
12594str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012595
INADA Naoki3ae20562017-01-16 20:41:20 +090012596Return True if the string is a valid Python identifier, False otherwise.
12597
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012598Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012599such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012600[clinic start generated code]*/
12601
12602static PyObject *
12603unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012604/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012605{
12606 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12607}
12608
INADA Naoki3ae20562017-01-16 20:41:20 +090012609/*[clinic input]
12610str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012611
INADA Naoki3ae20562017-01-16 20:41:20 +090012612Return True if the string is printable, False otherwise.
12613
12614A string is printable if all of its characters are considered printable in
12615repr() or if it is empty.
12616[clinic start generated code]*/
12617
12618static PyObject *
12619unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012620/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012621{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 Py_ssize_t i, length;
12623 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012624 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012625
12626 if (PyUnicode_READY(self) == -1)
12627 return NULL;
12628 length = PyUnicode_GET_LENGTH(self);
12629 kind = PyUnicode_KIND(self);
12630 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012631
12632 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 if (length == 1)
12634 return PyBool_FromLong(
12635 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 for (i = 0; i < length; i++) {
12638 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012639 Py_RETURN_FALSE;
12640 }
12641 }
12642 Py_RETURN_TRUE;
12643}
12644
INADA Naoki3ae20562017-01-16 20:41:20 +090012645/*[clinic input]
12646str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012647
INADA Naoki3ae20562017-01-16 20:41:20 +090012648 iterable: object
12649 /
12650
12651Concatenate any number of strings.
12652
Martin Panter91a88662017-01-24 00:30:06 +000012653The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012654The result is returned as a new string.
12655
12656Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12657[clinic start generated code]*/
12658
12659static PyObject *
12660unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012661/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662{
INADA Naoki3ae20562017-01-16 20:41:20 +090012663 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664}
12665
Martin v. Löwis18e16552006-02-15 17:27:45 +000012666static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012667unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012668{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 if (PyUnicode_READY(self) == -1)
12670 return -1;
12671 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012672}
12673
INADA Naoki3ae20562017-01-16 20:41:20 +090012674/*[clinic input]
12675str.ljust as unicode_ljust
12676
12677 width: Py_ssize_t
12678 fillchar: Py_UCS4 = ' '
12679 /
12680
12681Return a left-justified string of length width.
12682
12683Padding is done using the specified fill character (default is a space).
12684[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012685
12686static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012687unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12688/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012690 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012691 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012692
Victor Stinnerc4b49542011-12-11 22:44:26 +010012693 if (PyUnicode_GET_LENGTH(self) >= width)
12694 return unicode_result_unchanged(self);
12695
12696 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012697}
12698
INADA Naoki3ae20562017-01-16 20:41:20 +090012699/*[clinic input]
12700str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701
INADA Naoki3ae20562017-01-16 20:41:20 +090012702Return a copy of the string converted to lowercase.
12703[clinic start generated code]*/
12704
12705static PyObject *
12706unicode_lower_impl(PyObject *self)
12707/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012708{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012709 if (PyUnicode_READY(self) == -1)
12710 return NULL;
12711 if (PyUnicode_IS_ASCII(self))
12712 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012713 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012714}
12715
/* Strip modes; the values double as indexes into stripfuncnames. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each strip mode, keyed by the constants above. */
static const char *stripfuncnames[] = {
    [LEFTSTRIP] = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP] = "strip",
};

/* Look up the method name that belongs to strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012724
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012725/* externally visible for str.strip(unicode) */
12726PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012727_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012728{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012729 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 int kind;
12731 Py_ssize_t i, j, len;
12732 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012733 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12736 return NULL;
12737
12738 kind = PyUnicode_KIND(self);
12739 data = PyUnicode_DATA(self);
12740 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012741 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012742 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12743 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012744 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012745
Benjamin Peterson14339b62009-01-31 16:36:08 +000012746 i = 0;
12747 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012748 while (i < len) {
12749 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12750 if (!BLOOM(sepmask, ch))
12751 break;
12752 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12753 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012754 i++;
12755 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012756 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012757
Benjamin Peterson14339b62009-01-31 16:36:08 +000012758 j = len;
12759 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012760 j--;
12761 while (j >= i) {
12762 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12763 if (!BLOOM(sepmask, ch))
12764 break;
12765 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12766 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012767 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012768 }
12769
Benjamin Peterson29060642009-01-31 22:14:21 +000012770 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012771 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012772
Victor Stinner7931d9a2011-11-04 00:22:48 +010012773 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012774}
12775
12776PyObject*
12777PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12778{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012779 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012781 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012782
Victor Stinnerde636f32011-10-01 03:55:54 +020012783 if (PyUnicode_READY(self) == -1)
12784 return NULL;
12785
Victor Stinner684d5fd2012-05-03 02:32:34 +020012786 length = PyUnicode_GET_LENGTH(self);
12787 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012788
Victor Stinner684d5fd2012-05-03 02:32:34 +020012789 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012790 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791
Victor Stinnerde636f32011-10-01 03:55:54 +020012792 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012793 PyErr_SetString(PyExc_IndexError, "string index out of range");
12794 return NULL;
12795 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012796 if (start >= length || end < start)
12797 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012798
Victor Stinner684d5fd2012-05-03 02:32:34 +020012799 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012800 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012801 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012802 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012803 }
12804 else {
12805 kind = PyUnicode_KIND(self);
12806 data = PyUnicode_1BYTE_DATA(self);
12807 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012808 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012809 length);
12810 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812
12813static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012814do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012815{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816 Py_ssize_t len, i, j;
12817
12818 if (PyUnicode_READY(self) == -1)
12819 return NULL;
12820
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012822
Victor Stinnercc7af722013-04-09 22:39:24 +020012823 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012824 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012825
12826 i = 0;
12827 if (striptype != RIGHTSTRIP) {
12828 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012829 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012830 if (!_Py_ascii_whitespace[ch])
12831 break;
12832 i++;
12833 }
12834 }
12835
12836 j = len;
12837 if (striptype != LEFTSTRIP) {
12838 j--;
12839 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012840 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012841 if (!_Py_ascii_whitespace[ch])
12842 break;
12843 j--;
12844 }
12845 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012846 }
12847 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012848 else {
12849 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012850 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012851
Victor Stinnercc7af722013-04-09 22:39:24 +020012852 i = 0;
12853 if (striptype != RIGHTSTRIP) {
12854 while (i < len) {
12855 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12856 if (!Py_UNICODE_ISSPACE(ch))
12857 break;
12858 i++;
12859 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012860 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012861
12862 j = len;
12863 if (striptype != LEFTSTRIP) {
12864 j--;
12865 while (j >= i) {
12866 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12867 if (!Py_UNICODE_ISSPACE(ch))
12868 break;
12869 j--;
12870 }
12871 j++;
12872 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012873 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012874
Victor Stinner7931d9a2011-11-04 00:22:48 +010012875 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876}
12877
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012878
12879static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012880do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012881{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012882 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012883 if (PyUnicode_Check(sep))
12884 return _PyUnicode_XStrip(self, striptype, sep);
12885 else {
12886 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012887 "%s arg must be None or str",
12888 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012889 return NULL;
12890 }
12891 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012892
Benjamin Peterson14339b62009-01-31 16:36:08 +000012893 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012894}
12895
12896
INADA Naoki3ae20562017-01-16 20:41:20 +090012897/*[clinic input]
12898str.strip as unicode_strip
12899
12900 chars: object = None
12901 /
12902
Zachary Ware09895c22019-10-09 16:09:00 -050012903Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012904
12905If chars is given and not None, remove characters in chars instead.
12906[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012907
12908static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012909unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012910/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012911{
INADA Naoki3ae20562017-01-16 20:41:20 +090012912 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012913}
12914
12915
INADA Naoki3ae20562017-01-16 20:41:20 +090012916/*[clinic input]
12917str.lstrip as unicode_lstrip
12918
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012919 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012920 /
12921
12922Return a copy of the string with leading whitespace removed.
12923
12924If chars is given and not None, remove characters in chars instead.
12925[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012926
12927static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012928unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012929/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012930{
INADA Naoki3ae20562017-01-16 20:41:20 +090012931 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012932}
12933
12934
INADA Naoki3ae20562017-01-16 20:41:20 +090012935/*[clinic input]
12936str.rstrip as unicode_rstrip
12937
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012938 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012939 /
12940
12941Return a copy of the string with trailing whitespace removed.
12942
12943If chars is given and not None, remove characters in chars instead.
12944[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012945
12946static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012947unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012948/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012949{
INADA Naoki3ae20562017-01-16 20:41:20 +090012950 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012951}
12952
12953
Guido van Rossumd57fd912000-03-10 22:53:23 +000012954static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012955unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012956{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012957 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012958 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012959
Serhiy Storchaka05997252013-01-26 12:14:02 +020012960 if (len < 1)
12961 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012962
Victor Stinnerc4b49542011-12-11 22:44:26 +010012963 /* no repeat, return original string */
12964 if (len == 1)
12965 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012966
Benjamin Petersonbac79492012-01-14 13:34:47 -050012967 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 return NULL;
12969
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012970 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012971 PyErr_SetString(PyExc_OverflowError,
12972 "repeated string is too long");
12973 return NULL;
12974 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012975 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012976
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012977 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012978 if (!u)
12979 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012980 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012981
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012983 int kind = PyUnicode_KIND(str);
12984 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012985 if (kind == PyUnicode_1BYTE_KIND) {
12986 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012987 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012988 }
12989 else if (kind == PyUnicode_2BYTE_KIND) {
12990 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012991 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012992 ucs2[n] = fill_char;
12993 } else {
12994 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12995 assert(kind == PyUnicode_4BYTE_KIND);
12996 for (n = 0; n < len; ++n)
12997 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012998 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012999 }
13000 else {
13001 /* number of characters copied this far */
13002 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013003 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013004 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020013005 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013006 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000013007 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013008 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020013009 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013010 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000013011 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000013012 }
13013
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013014 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013015 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013016}
13017
Alexander Belopolsky40018472011-02-26 01:02:56 +000013018PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013019PyUnicode_Replace(PyObject *str,
13020 PyObject *substr,
13021 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000013022 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013023{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013024 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
13025 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013026 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013027 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013028}
13029
INADA Naoki3ae20562017-01-16 20:41:20 +090013030/*[clinic input]
13031str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000013032
INADA Naoki3ae20562017-01-16 20:41:20 +090013033 old: unicode
13034 new: unicode
13035 count: Py_ssize_t = -1
13036 Maximum number of occurrences to replace.
13037 -1 (the default value) means replace all occurrences.
13038 /
13039
13040Return a copy with all occurrences of substring old replaced by new.
13041
13042If the optional argument count is given, only the first count occurrences are
13043replaced.
13044[clinic start generated code]*/
13045
13046static PyObject *
13047unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
13048 Py_ssize_t count)
13049/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013050{
Benjamin Peterson22a29702012-01-02 09:00:30 -060013051 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013052 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090013053 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013054}
13055
sweeneydea81849b2020-04-22 17:05:48 -040013056/*[clinic input]
13057str.removeprefix as unicode_removeprefix
13058
13059 prefix: unicode
13060 /
13061
13062Return a str with the given prefix string removed if present.
13063
13064If the string starts with the prefix string, return string[len(prefix):].
13065Otherwise, return a copy of the original string.
13066[clinic start generated code]*/
13067
13068static PyObject *
13069unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13070/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13071{
13072 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13073 if (match == -1) {
13074 return NULL;
13075 }
13076 if (match) {
13077 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13078 PyUnicode_GET_LENGTH(self));
13079 }
13080 return unicode_result_unchanged(self);
13081}
13082
13083/*[clinic input]
13084str.removesuffix as unicode_removesuffix
13085
13086 suffix: unicode
13087 /
13088
13089Return a str with the given suffix string removed if present.
13090
13091If the string ends with the suffix string and that suffix is not empty,
13092return string[:-len(suffix)]. Otherwise, return a copy of the original
13093string.
13094[clinic start generated code]*/
13095
13096static PyObject *
13097unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13098/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13099{
13100 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13101 if (match == -1) {
13102 return NULL;
13103 }
13104 if (match) {
13105 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13106 - PyUnicode_GET_LENGTH(suffix));
13107 }
13108 return unicode_result_unchanged(self);
13109}
13110
Alexander Belopolsky40018472011-02-26 01:02:56 +000013111static PyObject *
13112unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013113{
Walter Dörwald79e913e2007-05-12 11:08:06 +000013114 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013115 Py_ssize_t isize;
13116 Py_ssize_t osize, squote, dquote, i, o;
13117 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020013118 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013119 const void *idata;
13120 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000013121
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013122 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000013123 return NULL;
13124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013125 isize = PyUnicode_GET_LENGTH(unicode);
13126 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000013127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013128 /* Compute length of output, quote characters, and
13129 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020013130 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013131 max = 127;
13132 squote = dquote = 0;
13133 ikind = PyUnicode_KIND(unicode);
13134 for (i = 0; i < isize; i++) {
13135 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040013136 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013137 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040013138 case '\'': squote++; break;
13139 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013140 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040013141 incr = 2;
13142 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013143 default:
13144 /* Fast-path ASCII */
13145 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013146 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013147 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013148 ;
13149 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013150 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013151 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013152 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013153 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013154 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013155 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013156 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013157 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013158 if (osize > PY_SSIZE_T_MAX - incr) {
13159 PyErr_SetString(PyExc_OverflowError,
13160 "string is too long to generate repr");
13161 return NULL;
13162 }
13163 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013164 }
13165
13166 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013167 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013168 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013169 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013170 if (dquote)
13171 /* Both squote and dquote present. Use squote,
13172 and escape them */
13173 osize += squote;
13174 else
13175 quote = '"';
13176 }
Victor Stinner55c08782013-04-14 18:45:39 +020013177 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013178
13179 repr = PyUnicode_New(osize, max);
13180 if (repr == NULL)
13181 return NULL;
13182 okind = PyUnicode_KIND(repr);
13183 odata = PyUnicode_DATA(repr);
13184
13185 PyUnicode_WRITE(okind, odata, 0, quote);
13186 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013187 if (unchanged) {
13188 _PyUnicode_FastCopyCharacters(repr, 1,
13189 unicode, 0,
13190 isize);
13191 }
13192 else {
13193 for (i = 0, o = 1; i < isize; i++) {
13194 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013195
Victor Stinner55c08782013-04-14 18:45:39 +020013196 /* Escape quotes and backslashes */
13197 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013198 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013199 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013200 continue;
13201 }
13202
13203 /* Map special whitespace to '\t', \n', '\r' */
13204 if (ch == '\t') {
13205 PyUnicode_WRITE(okind, odata, o++, '\\');
13206 PyUnicode_WRITE(okind, odata, o++, 't');
13207 }
13208 else if (ch == '\n') {
13209 PyUnicode_WRITE(okind, odata, o++, '\\');
13210 PyUnicode_WRITE(okind, odata, o++, 'n');
13211 }
13212 else if (ch == '\r') {
13213 PyUnicode_WRITE(okind, odata, o++, '\\');
13214 PyUnicode_WRITE(okind, odata, o++, 'r');
13215 }
13216
13217 /* Map non-printable US ASCII to '\xhh' */
13218 else if (ch < ' ' || ch == 0x7F) {
13219 PyUnicode_WRITE(okind, odata, o++, '\\');
13220 PyUnicode_WRITE(okind, odata, o++, 'x');
13221 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13222 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13223 }
13224
13225 /* Copy ASCII characters as-is */
13226 else if (ch < 0x7F) {
13227 PyUnicode_WRITE(okind, odata, o++, ch);
13228 }
13229
13230 /* Non-ASCII characters */
13231 else {
13232 /* Map Unicode whitespace and control characters
13233 (categories Z* and C* except ASCII space)
13234 */
13235 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13236 PyUnicode_WRITE(okind, odata, o++, '\\');
13237 /* Map 8-bit characters to '\xhh' */
13238 if (ch <= 0xff) {
13239 PyUnicode_WRITE(okind, odata, o++, 'x');
13240 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13241 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13242 }
13243 /* Map 16-bit characters to '\uxxxx' */
13244 else if (ch <= 0xffff) {
13245 PyUnicode_WRITE(okind, odata, o++, 'u');
13246 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13247 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13248 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13249 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13250 }
13251 /* Map 21-bit characters to '\U00xxxxxx' */
13252 else {
13253 PyUnicode_WRITE(okind, odata, o++, 'U');
13254 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13255 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13256 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13257 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13258 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13259 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13260 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13261 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13262 }
13263 }
13264 /* Copy characters as-is */
13265 else {
13266 PyUnicode_WRITE(okind, odata, o++, ch);
13267 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013268 }
13269 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013270 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013271 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013272 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013273 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274}
13275
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013276PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013277 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013278\n\
13279Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013280such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013281arguments start and end are interpreted as in slice notation.\n\
13282\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013283Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013284
13285static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013286unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013287{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013288 /* initialize variables to prevent gcc warning */
13289 PyObject *substring = NULL;
13290 Py_ssize_t start = 0;
13291 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013292 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013293
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013294 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013295 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013296
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013297 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013298 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013299
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013300 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013301
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013302 if (result == -2)
13303 return NULL;
13304
Christian Heimes217cfd12007-12-02 14:31:20 +000013305 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013306}
13307
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013308PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013309 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013310\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013311Return the highest index in S where substring sub is found,\n\
13312such that sub is contained within S[start:end]. Optional\n\
13313arguments start and end are interpreted as in slice notation.\n\
13314\n\
13315Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013316
13317static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013318unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013319{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013320 /* initialize variables to prevent gcc warning */
13321 PyObject *substring = NULL;
13322 Py_ssize_t start = 0;
13323 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013324 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013325
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013326 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013327 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013328
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013329 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013330 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013332 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013334 if (result == -2)
13335 return NULL;
13336
Guido van Rossumd57fd912000-03-10 22:53:23 +000013337 if (result < 0) {
13338 PyErr_SetString(PyExc_ValueError, "substring not found");
13339 return NULL;
13340 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013341
Christian Heimes217cfd12007-12-02 14:31:20 +000013342 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013343}
13344
INADA Naoki3ae20562017-01-16 20:41:20 +090013345/*[clinic input]
13346str.rjust as unicode_rjust
13347
13348 width: Py_ssize_t
13349 fillchar: Py_UCS4 = ' '
13350 /
13351
13352Return a right-justified string of length width.
13353
13354Padding is done using the specified fill character (default is a space).
13355[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013356
13357static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013358unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13359/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013360{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013361 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362 return NULL;
13363
Victor Stinnerc4b49542011-12-11 22:44:26 +010013364 if (PyUnicode_GET_LENGTH(self) >= width)
13365 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013366
Victor Stinnerc4b49542011-12-11 22:44:26 +010013367 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013368}
13369
Alexander Belopolsky40018472011-02-26 01:02:56 +000013370PyObject *
13371PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013373 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013374 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013375
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013376 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377}
13378
INADA Naoki3ae20562017-01-16 20:41:20 +090013379/*[clinic input]
13380str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013381
INADA Naoki3ae20562017-01-16 20:41:20 +090013382 sep: object = None
13383 The delimiter according which to split the string.
13384 None (the default value) means split according to any whitespace,
13385 and discard empty strings from the result.
13386 maxsplit: Py_ssize_t = -1
13387 Maximum number of splits to do.
13388 -1 (the default value) means no limit.
13389
13390Return a list of the words in the string, using sep as the delimiter string.
13391[clinic start generated code]*/
13392
13393static PyObject *
13394unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13395/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013396{
INADA Naoki3ae20562017-01-16 20:41:20 +090013397 if (sep == Py_None)
13398 return split(self, NULL, maxsplit);
13399 if (PyUnicode_Check(sep))
13400 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013401
Victor Stinner998b8062018-09-12 00:23:25 +020013402 PyErr_Format(PyExc_TypeError,
13403 "must be str or None, not %.100s",
13404 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013405 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013406}
13407
Thomas Wouters477c8d52006-05-27 19:21:47 +000013408PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013409PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013410{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013411 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013412 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013413 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013414 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013415
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013416 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013418
Victor Stinner14f8f022011-10-05 20:58:25 +020013419 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013420 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013421 len1 = PyUnicode_GET_LENGTH(str_obj);
13422 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013423 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013424 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013425 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013426 }
13427 buf1 = PyUnicode_DATA(str_obj);
13428 buf2 = PyUnicode_DATA(sep_obj);
13429 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013430 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013431 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013432 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013433 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013434
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013435 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013436 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013437 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13438 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13439 else
13440 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013441 break;
13442 case PyUnicode_2BYTE_KIND:
13443 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13444 break;
13445 case PyUnicode_4BYTE_KIND:
13446 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13447 break;
13448 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013449 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013450 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013451
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013452 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013453 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013454 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013455
13456 return out;
13457}
13458
13459
13460PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013461PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013462{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013463 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013464 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013465 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013466 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013467
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013468 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013469 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013470
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013471 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013472 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013473 len1 = PyUnicode_GET_LENGTH(str_obj);
13474 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013475 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013476 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013477 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013478 }
13479 buf1 = PyUnicode_DATA(str_obj);
13480 buf2 = PyUnicode_DATA(sep_obj);
13481 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013482 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013483 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013484 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013485 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013486
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013487 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013488 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013489 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13490 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13491 else
13492 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013493 break;
13494 case PyUnicode_2BYTE_KIND:
13495 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13496 break;
13497 case PyUnicode_4BYTE_KIND:
13498 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13499 break;
13500 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013501 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013502 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013503
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013504 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013505 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013506 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013507
13508 return out;
13509}
13510
INADA Naoki3ae20562017-01-16 20:41:20 +090013511/*[clinic input]
13512str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013513
INADA Naoki3ae20562017-01-16 20:41:20 +090013514 sep: object
13515 /
13516
13517Partition the string into three parts using the given separator.
13518
13519This will search for the separator in the string. If the separator is found,
13520returns a 3-tuple containing the part before the separator, the separator
13521itself, and the part after it.
13522
13523If the separator is not found, returns a 3-tuple containing the original string
13524and two empty strings.
13525[clinic start generated code]*/
13526
13527static PyObject *
13528unicode_partition(PyObject *self, PyObject *sep)
13529/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013530{
INADA Naoki3ae20562017-01-16 20:41:20 +090013531 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013532}
13533
INADA Naoki3ae20562017-01-16 20:41:20 +090013534/*[clinic input]
13535str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013536
INADA Naoki3ae20562017-01-16 20:41:20 +090013537Partition the string into three parts using the given separator.
13538
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013539This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013540the separator is found, returns a 3-tuple containing the part before the
13541separator, the separator itself, and the part after it.
13542
13543If the separator is not found, returns a 3-tuple containing two empty strings
13544and the original string.
13545[clinic start generated code]*/
13546
13547static PyObject *
13548unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013549/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013550{
INADA Naoki3ae20562017-01-16 20:41:20 +090013551 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013552}
13553
Alexander Belopolsky40018472011-02-26 01:02:56 +000013554PyObject *
13555PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013556{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013557 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013558 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013559
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013560 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013561}
13562
INADA Naoki3ae20562017-01-16 20:41:20 +090013563/*[clinic input]
13564str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013565
INADA Naoki3ae20562017-01-16 20:41:20 +090013566Return a list of the words in the string, using sep as the delimiter string.
13567
13568Splits are done starting at the end of the string and working to the front.
13569[clinic start generated code]*/
13570
13571static PyObject *
13572unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13573/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013574{
INADA Naoki3ae20562017-01-16 20:41:20 +090013575 if (sep == Py_None)
13576 return rsplit(self, NULL, maxsplit);
13577 if (PyUnicode_Check(sep))
13578 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013579
Victor Stinner998b8062018-09-12 00:23:25 +020013580 PyErr_Format(PyExc_TypeError,
13581 "must be str or None, not %.100s",
13582 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013583 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013584}
13585
INADA Naoki3ae20562017-01-16 20:41:20 +090013586/*[clinic input]
13587str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013588
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013589 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013590
13591Return a list of the lines in the string, breaking at line boundaries.
13592
13593Line breaks are not included in the resulting list unless keepends is given and
13594true.
13595[clinic start generated code]*/
13596
13597static PyObject *
13598unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013599/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013600{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013601 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013602}
13603
13604static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013605PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013606{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013607 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013608}
13609
INADA Naoki3ae20562017-01-16 20:41:20 +090013610/*[clinic input]
13611str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013612
INADA Naoki3ae20562017-01-16 20:41:20 +090013613Convert uppercase characters to lowercase and lowercase characters to uppercase.
13614[clinic start generated code]*/
13615
13616static PyObject *
13617unicode_swapcase_impl(PyObject *self)
13618/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013619{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013620 if (PyUnicode_READY(self) == -1)
13621 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013622 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013623}
13624
Larry Hastings61272b72014-01-07 12:41:53 -080013625/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013626
Larry Hastings31826802013-10-19 00:09:25 -070013627@staticmethod
13628str.maketrans as unicode_maketrans
13629
13630 x: object
13631
13632 y: unicode=NULL
13633
13634 z: unicode=NULL
13635
13636 /
13637
13638Return a translation table usable for str.translate().
13639
13640If there is only one argument, it must be a dictionary mapping Unicode
13641ordinals (integers) or characters to Unicode ordinals, strings or None.
13642Character keys will be then converted to ordinals.
13643If there are two arguments, they must be strings of equal length, and
13644in the resulting dictionary, each character in x will be mapped to the
13645character at the same position in y. If there is a third argument, it
13646must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013647[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013648
Larry Hastings31826802013-10-19 00:09:25 -070013649static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013650unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013651/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013652{
Georg Brandlceee0772007-11-27 23:48:05 +000013653 PyObject *new = NULL, *key, *value;
13654 Py_ssize_t i = 0;
13655 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013656
Georg Brandlceee0772007-11-27 23:48:05 +000013657 new = PyDict_New();
13658 if (!new)
13659 return NULL;
13660 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013661 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013662 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013663
Georg Brandlceee0772007-11-27 23:48:05 +000013664 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013665 if (!PyUnicode_Check(x)) {
13666 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13667 "be a string if there is a second argument");
13668 goto err;
13669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013670 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013671 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13672 "arguments must have equal length");
13673 goto err;
13674 }
13675 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013676 x_kind = PyUnicode_KIND(x);
13677 y_kind = PyUnicode_KIND(y);
13678 x_data = PyUnicode_DATA(x);
13679 y_data = PyUnicode_DATA(y);
13680 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13681 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013682 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013683 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013684 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013685 if (!value) {
13686 Py_DECREF(key);
13687 goto err;
13688 }
Georg Brandlceee0772007-11-27 23:48:05 +000013689 res = PyDict_SetItem(new, key, value);
13690 Py_DECREF(key);
13691 Py_DECREF(value);
13692 if (res < 0)
13693 goto err;
13694 }
13695 /* create entries for deleting chars in z */
13696 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013697 z_kind = PyUnicode_KIND(z);
13698 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013699 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013700 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013701 if (!key)
13702 goto err;
13703 res = PyDict_SetItem(new, key, Py_None);
13704 Py_DECREF(key);
13705 if (res < 0)
13706 goto err;
13707 }
13708 }
13709 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013710 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013711 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013712
Georg Brandlceee0772007-11-27 23:48:05 +000013713 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013714 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013715 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13716 "to maketrans it must be a dict");
13717 goto err;
13718 }
13719 /* copy entries into the new dict, converting string keys to int keys */
13720 while (PyDict_Next(x, &i, &key, &value)) {
13721 if (PyUnicode_Check(key)) {
13722 /* convert string keys to integer keys */
13723 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013724 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013725 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13726 "table must be of length 1");
13727 goto err;
13728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013729 kind = PyUnicode_KIND(key);
13730 data = PyUnicode_DATA(key);
13731 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013732 if (!newkey)
13733 goto err;
13734 res = PyDict_SetItem(new, newkey, value);
13735 Py_DECREF(newkey);
13736 if (res < 0)
13737 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013738 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013739 /* just keep integer keys */
13740 if (PyDict_SetItem(new, key, value) < 0)
13741 goto err;
13742 } else {
13743 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13744 "be strings or integers");
13745 goto err;
13746 }
13747 }
13748 }
13749 return new;
13750 err:
13751 Py_DECREF(new);
13752 return NULL;
13753}
13754
INADA Naoki3ae20562017-01-16 20:41:20 +090013755/*[clinic input]
13756str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013757
INADA Naoki3ae20562017-01-16 20:41:20 +090013758 table: object
13759 Translation table, which must be a mapping of Unicode ordinals to
13760 Unicode ordinals, strings, or None.
13761 /
13762
13763Replace each character in the string using the given translation table.
13764
13765The table must implement lookup/indexing via __getitem__, for instance a
13766dictionary or list. If this operation raises LookupError, the character is
13767left untouched. Characters mapped to None are deleted.
13768[clinic start generated code]*/
13769
13770static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013771unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013772/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013773{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013774 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013775}
13776
INADA Naoki3ae20562017-01-16 20:41:20 +090013777/*[clinic input]
13778str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013779
INADA Naoki3ae20562017-01-16 20:41:20 +090013780Return a copy of the string converted to uppercase.
13781[clinic start generated code]*/
13782
13783static PyObject *
13784unicode_upper_impl(PyObject *self)
13785/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013786{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013787 if (PyUnicode_READY(self) == -1)
13788 return NULL;
13789 if (PyUnicode_IS_ASCII(self))
13790 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013791 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013792}
13793
INADA Naoki3ae20562017-01-16 20:41:20 +090013794/*[clinic input]
13795str.zfill as unicode_zfill
13796
13797 width: Py_ssize_t
13798 /
13799
13800Pad a numeric string with zeros on the left, to fill a field of the given width.
13801
13802The string is never truncated.
13803[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013804
13805static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013806unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013807/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013808{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013809 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013810 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013811 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013812 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013813 Py_UCS4 chr;
13814
Benjamin Petersonbac79492012-01-14 13:34:47 -050013815 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013816 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013817
Victor Stinnerc4b49542011-12-11 22:44:26 +010013818 if (PyUnicode_GET_LENGTH(self) >= width)
13819 return unicode_result_unchanged(self);
13820
13821 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013822
13823 u = pad(self, fill, 0, '0');
13824
Walter Dörwald068325e2002-04-15 13:36:47 +000013825 if (u == NULL)
13826 return NULL;
13827
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013828 kind = PyUnicode_KIND(u);
13829 data = PyUnicode_DATA(u);
13830 chr = PyUnicode_READ(kind, data, fill);
13831
13832 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013833 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013834 PyUnicode_WRITE(kind, data, 0, chr);
13835 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013836 }
13837
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013838 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013839 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013840}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013841
#if 0
/* Compiled out by the surrounding "#if 0": a wrapper that would return
   PyUnicode_TransformDecimalAndSpaceToASCII(self). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013850PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013851 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013852\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013853Return True if S starts with the specified prefix, False otherwise.\n\
13854With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013855With optional end, stop comparing S at that position.\n\
13856prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013857
13858static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013859unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013860 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013861{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013862 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013863 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013864 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013865 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013866 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013867
Jesus Ceaac451502011-04-20 17:09:23 +020013868 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013869 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013870 if (PyTuple_Check(subobj)) {
13871 Py_ssize_t i;
13872 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013873 substring = PyTuple_GET_ITEM(subobj, i);
13874 if (!PyUnicode_Check(substring)) {
13875 PyErr_Format(PyExc_TypeError,
13876 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013877 "not %.100s",
13878 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013879 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013880 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013881 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013882 if (result == -1)
13883 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013884 if (result) {
13885 Py_RETURN_TRUE;
13886 }
13887 }
13888 /* nothing matched */
13889 Py_RETURN_FALSE;
13890 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013891 if (!PyUnicode_Check(subobj)) {
13892 PyErr_Format(PyExc_TypeError,
13893 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013894 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013895 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013896 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013897 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013898 if (result == -1)
13899 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013900 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013901}
13902
13903
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013904PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013905 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013906\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013907Return True if S ends with the specified suffix, False otherwise.\n\
13908With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013909With optional end, stop comparing S at that position.\n\
13910suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013911
13912static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013913unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013914 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013915{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013916 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013917 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013918 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013919 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013920 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013921
Jesus Ceaac451502011-04-20 17:09:23 +020013922 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013923 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013924 if (PyTuple_Check(subobj)) {
13925 Py_ssize_t i;
13926 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013927 substring = PyTuple_GET_ITEM(subobj, i);
13928 if (!PyUnicode_Check(substring)) {
13929 PyErr_Format(PyExc_TypeError,
13930 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013931 "not %.100s",
13932 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013933 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013934 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013935 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013936 if (result == -1)
13937 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013938 if (result) {
13939 Py_RETURN_TRUE;
13940 }
13941 }
13942 Py_RETURN_FALSE;
13943 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013944 if (!PyUnicode_Check(subobj)) {
13945 PyErr_Format(PyExc_TypeError,
13946 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013947 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013948 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013949 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013950 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013951 if (result == -1)
13952 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013953 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013954}
13955
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013956static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013957_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013958{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013959 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13960 writer->data = PyUnicode_DATA(writer->buffer);
13961
13962 if (!writer->readonly) {
13963 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013964 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013965 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013966 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013967 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13968 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13969 writer->kind = PyUnicode_WCHAR_KIND;
13970 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13971
Victor Stinner8f674cc2013-04-17 23:02:17 +020013972 /* Copy-on-write mode: set buffer size to 0 so
13973 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13974 * next write. */
13975 writer->size = 0;
13976 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013977}
13978
Victor Stinnerd3f08822012-05-29 12:57:52 +020013979void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013980_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013981{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013982 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013983
13984 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013985 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013986
13987 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13988 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13989 writer->kind = PyUnicode_WCHAR_KIND;
13990 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013991}
13992
Inada Naoki770847a2019-06-24 12:30:24 +090013993// Initialize _PyUnicodeWriter with initial buffer
13994static inline void
13995_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13996{
13997 memset(writer, 0, sizeof(*writer));
13998 writer->buffer = buffer;
13999 _PyUnicodeWriter_Update(writer);
14000 writer->min_length = writer->size;
14001}
14002
Victor Stinnerd3f08822012-05-29 12:57:52 +020014003int
14004_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
14005 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020014006{
14007 Py_ssize_t newlen;
14008 PyObject *newbuffer;
14009
Victor Stinner2740e462016-09-06 16:58:36 -070014010 assert(maxchar <= MAX_UNICODE);
14011
Victor Stinnerca9381e2015-09-22 00:58:32 +020014012 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020014013 assert((maxchar > writer->maxchar && length >= 0)
14014 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014015
Victor Stinner202fdca2012-05-07 12:47:02 +020014016 if (length > PY_SSIZE_T_MAX - writer->pos) {
14017 PyErr_NoMemory();
14018 return -1;
14019 }
14020 newlen = writer->pos + length;
14021
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014022 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020014023
Victor Stinnerd3f08822012-05-29 12:57:52 +020014024 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020014025 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010014026 if (writer->overallocate
14027 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14028 /* overallocate to limit the number of realloc() */
14029 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014030 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014031 if (newlen < writer->min_length)
14032 newlen = writer->min_length;
14033
Victor Stinnerd3f08822012-05-29 12:57:52 +020014034 writer->buffer = PyUnicode_New(newlen, maxchar);
14035 if (writer->buffer == NULL)
14036 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014037 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014038 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010014039 if (writer->overallocate
14040 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14041 /* overallocate to limit the number of realloc() */
14042 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014043 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014044 if (newlen < writer->min_length)
14045 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014046
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014047 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020014048 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030014049 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020014050 newbuffer = PyUnicode_New(newlen, maxchar);
14051 if (newbuffer == NULL)
14052 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014053 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14054 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020014055 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014056 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020014057 }
14058 else {
14059 newbuffer = resize_compact(writer->buffer, newlen);
14060 if (newbuffer == NULL)
14061 return -1;
14062 }
14063 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020014064 }
14065 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014066 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014067 newbuffer = PyUnicode_New(writer->size, maxchar);
14068 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020014069 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014070 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14071 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030014072 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014073 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014074 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014075 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010014076
14077#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020014078}
14079
Victor Stinnerca9381e2015-09-22 00:58:32 +020014080int
14081_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14082 enum PyUnicode_Kind kind)
14083{
14084 Py_UCS4 maxchar;
14085
14086 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14087 assert(writer->kind < kind);
14088
14089 switch (kind)
14090 {
14091 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14092 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
14093 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
14094 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014095 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020014096 }
14097
14098 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14099}
14100
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014101static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014102_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020014103{
Victor Stinner2740e462016-09-06 16:58:36 -070014104 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020014105 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14106 return -1;
14107 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14108 writer->pos++;
14109 return 0;
14110}
14111
14112int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014113_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14114{
14115 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14116}
14117
14118int
Victor Stinnerd3f08822012-05-29 12:57:52 +020014119_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14120{
14121 Py_UCS4 maxchar;
14122 Py_ssize_t len;
14123
14124 if (PyUnicode_READY(str) == -1)
14125 return -1;
14126 len = PyUnicode_GET_LENGTH(str);
14127 if (len == 0)
14128 return 0;
14129 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14130 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014131 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010014132 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020014133 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014134 Py_INCREF(str);
14135 writer->buffer = str;
14136 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014137 writer->pos += len;
14138 return 0;
14139 }
14140 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14141 return -1;
14142 }
14143 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14144 str, 0, len);
14145 writer->pos += len;
14146 return 0;
14147}
14148
Victor Stinnere215d962012-10-06 23:03:36 +020014149int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014150_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14151 Py_ssize_t start, Py_ssize_t end)
14152{
14153 Py_UCS4 maxchar;
14154 Py_ssize_t len;
14155
14156 if (PyUnicode_READY(str) == -1)
14157 return -1;
14158
14159 assert(0 <= start);
14160 assert(end <= PyUnicode_GET_LENGTH(str));
14161 assert(start <= end);
14162
14163 if (end == 0)
14164 return 0;
14165
14166 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14167 return _PyUnicodeWriter_WriteStr(writer, str);
14168
14169 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14170 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14171 else
14172 maxchar = writer->maxchar;
14173 len = end - start;
14174
14175 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14176 return -1;
14177
14178 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14179 str, start, len);
14180 writer->pos += len;
14181 return 0;
14182}
14183
14184int
Victor Stinner4a587072013-11-19 12:54:53 +010014185_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14186 const char *ascii, Py_ssize_t len)
14187{
14188 if (len == -1)
14189 len = strlen(ascii);
14190
Andy Lestere6be9b52020-02-11 20:28:35 -060014191 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014192
14193 if (writer->buffer == NULL && !writer->overallocate) {
14194 PyObject *str;
14195
14196 str = _PyUnicode_FromASCII(ascii, len);
14197 if (str == NULL)
14198 return -1;
14199
14200 writer->readonly = 1;
14201 writer->buffer = str;
14202 _PyUnicodeWriter_Update(writer);
14203 writer->pos += len;
14204 return 0;
14205 }
14206
14207 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14208 return -1;
14209
14210 switch (writer->kind)
14211 {
14212 case PyUnicode_1BYTE_KIND:
14213 {
14214 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14215 Py_UCS1 *data = writer->data;
14216
Christian Heimesf051e432016-09-13 20:22:02 +020014217 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014218 break;
14219 }
14220 case PyUnicode_2BYTE_KIND:
14221 {
14222 _PyUnicode_CONVERT_BYTES(
14223 Py_UCS1, Py_UCS2,
14224 ascii, ascii + len,
14225 (Py_UCS2 *)writer->data + writer->pos);
14226 break;
14227 }
14228 case PyUnicode_4BYTE_KIND:
14229 {
14230 _PyUnicode_CONVERT_BYTES(
14231 Py_UCS1, Py_UCS4,
14232 ascii, ascii + len,
14233 (Py_UCS4 *)writer->data + writer->pos);
14234 break;
14235 }
14236 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014237 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014238 }
14239
14240 writer->pos += len;
14241 return 0;
14242}
14243
14244int
14245_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14246 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014247{
14248 Py_UCS4 maxchar;
14249
Andy Lestere6be9b52020-02-11 20:28:35 -060014250 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014251 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14252 return -1;
14253 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14254 writer->pos += len;
14255 return 0;
14256}
14257
Victor Stinnerd3f08822012-05-29 12:57:52 +020014258PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014259_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014260{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014261 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014262
Victor Stinnerd3f08822012-05-29 12:57:52 +020014263 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014264 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014265 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014266 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014267
14268 str = writer->buffer;
14269 writer->buffer = NULL;
14270
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014271 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014272 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14273 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014274 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014275
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014276 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14277 PyObject *str2;
14278 str2 = resize_compact(str, writer->pos);
14279 if (str2 == NULL) {
14280 Py_DECREF(str);
14281 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014282 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014283 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014284 }
14285
Victor Stinner15a0bd32013-07-08 22:29:55 +020014286 assert(_PyUnicode_CheckConsistency(str, 1));
14287 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014288}
14289
Victor Stinnerd3f08822012-05-29 12:57:52 +020014290void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014291_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014292{
14293 Py_CLEAR(writer->buffer);
14294}
14295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014296#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014297
14298PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014299 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014300\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014301Return a formatted version of S, using substitutions from args and kwargs.\n\
14302The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014303
Eric Smith27bbca62010-11-04 17:06:58 +000014304PyDoc_STRVAR(format_map__doc__,
14305 "S.format_map(mapping) -> str\n\
14306\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014307Return a formatted version of S, using substitutions from mapping.\n\
14308The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014309
INADA Naoki3ae20562017-01-16 20:41:20 +090014310/*[clinic input]
14311str.__format__ as unicode___format__
14312
14313 format_spec: unicode
14314 /
14315
14316Return a formatted version of the string as described by format_spec.
14317[clinic start generated code]*/
14318
Eric Smith4a7d76d2008-05-30 18:10:19 +000014319static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014320unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014321/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014322{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014323 _PyUnicodeWriter writer;
14324 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014325
Victor Stinnerd3f08822012-05-29 12:57:52 +020014326 if (PyUnicode_READY(self) == -1)
14327 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014328 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014329 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14330 self, format_spec, 0,
14331 PyUnicode_GET_LENGTH(format_spec));
14332 if (ret == -1) {
14333 _PyUnicodeWriter_Dealloc(&writer);
14334 return NULL;
14335 }
14336 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014337}
14338
INADA Naoki3ae20562017-01-16 20:41:20 +090014339/*[clinic input]
14340str.__sizeof__ as unicode_sizeof
14341
14342Return the size of the string in memory, in bytes.
14343[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014344
14345static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014346unicode_sizeof_impl(PyObject *self)
14347/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014348{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014349 Py_ssize_t size;
14350
14351 /* If it's a compact object, account for base structure +
14352 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014353 if (PyUnicode_IS_COMPACT_ASCII(self))
14354 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14355 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014356 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014357 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014358 else {
14359 /* If it is a two-block object, account for base object, and
14360 for character block if present. */
14361 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014362 if (_PyUnicode_DATA_ANY(self))
14363 size += (PyUnicode_GET_LENGTH(self) + 1) *
14364 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014365 }
14366 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014367 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014368 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14369 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14370 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14371 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014372
14373 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014374}
14375
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014376static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014377unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014378{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014379 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014380 if (!copy)
14381 return NULL;
14382 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014383}
14384
/* Method table for the str type.

   Entries spelled UNICODE_*_METHODDEF are macros defined outside this part
   of the file (presumably generated by Argument Clinic -- confirm against
   the clinic header); each is expected to expand to one complete table row
   including its trailing comma.  The explicit {name, function, flags, doc}
   rows are the methods declared by hand. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_REMOVEPREFIX_METHODDEF
    UNICODE_REMOVESUFFIX_METHODDEF
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* format() takes keyword arguments, hence the cast through
       void(*)(void) before PyCFunction. */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    /* Sentinel marking the end of the table. */
    {NULL, NULL}
};
14443
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014444static PyObject *
14445unicode_mod(PyObject *v, PyObject *w)
14446{
Brian Curtindfc80e32011-08-10 20:28:54 -050014447 if (!PyUnicode_Check(v))
14448 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014449 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014450}
14451
14452static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014453 0, /*nb_add*/
14454 0, /*nb_subtract*/
14455 0, /*nb_multiply*/
14456 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014457};
14458
Guido van Rossumd57fd912000-03-10 22:53:23 +000014459static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014460 (lenfunc) unicode_length, /* sq_length */
14461 PyUnicode_Concat, /* sq_concat */
14462 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14463 (ssizeargfunc) unicode_getitem, /* sq_item */
14464 0, /* sq_slice */
14465 0, /* sq_ass_item */
14466 0, /* sq_ass_slice */
14467 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014468};
14469
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014470static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014471unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014472{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014473 if (PyUnicode_READY(self) == -1)
14474 return NULL;
14475
Victor Stinnera15e2602020-04-08 02:01:56 +020014476 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014477 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014478 if (i == -1 && PyErr_Occurred())
14479 return NULL;
14480 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014481 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014482 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014483 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014484 Py_ssize_t start, stop, step, slicelength, i;
14485 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014486 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014487 const void *src_data;
14488 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014489 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014490 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014491
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014492 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014493 return NULL;
14494 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014495 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14496 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014497
14498 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014499 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014500 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014501 slicelength == PyUnicode_GET_LENGTH(self)) {
14502 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014503 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014504 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014505 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014506 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014507 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014508 src_kind = PyUnicode_KIND(self);
14509 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014510 if (!PyUnicode_IS_ASCII(self)) {
14511 kind_limit = kind_maxchar_limit(src_kind);
14512 max_char = 0;
14513 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14514 ch = PyUnicode_READ(src_kind, src_data, cur);
14515 if (ch > max_char) {
14516 max_char = ch;
14517 if (max_char >= kind_limit)
14518 break;
14519 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014520 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014521 }
Victor Stinner55c99112011-10-13 01:17:06 +020014522 else
14523 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014524 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014525 if (result == NULL)
14526 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014527 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014528 dest_data = PyUnicode_DATA(result);
14529
14530 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014531 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14532 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014533 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014534 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014535 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014536 } else {
14537 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14538 return NULL;
14539 }
14540}
14541
14542static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014543 (lenfunc)unicode_length, /* mp_length */
14544 (binaryfunc)unicode_subscript, /* mp_subscript */
14545 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014546};
14547
Guido van Rossumd57fd912000-03-10 22:53:23 +000014548
Guido van Rossumd57fd912000-03-10 22:53:23 +000014549/* Helpers for PyUnicode_Format() */
14550
Victor Stinnera47082312012-10-04 02:19:54 +020014551struct unicode_formatter_t {
14552 PyObject *args;
14553 int args_owned;
14554 Py_ssize_t arglen, argidx;
14555 PyObject *dict;
14556
14557 enum PyUnicode_Kind fmtkind;
14558 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014559 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014560 PyObject *fmtstr;
14561
14562 _PyUnicodeWriter writer;
14563};
14564
14565struct unicode_format_arg_t {
14566 Py_UCS4 ch;
14567 int flags;
14568 Py_ssize_t width;
14569 int prec;
14570 int sign;
14571};
14572
Guido van Rossumd57fd912000-03-10 22:53:23 +000014573static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014574unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014575{
Victor Stinnera47082312012-10-04 02:19:54 +020014576 Py_ssize_t argidx = ctx->argidx;
14577
14578 if (argidx < ctx->arglen) {
14579 ctx->argidx++;
14580 if (ctx->arglen < 0)
14581 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014582 else
Victor Stinnera47082312012-10-04 02:19:54 +020014583 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014584 }
14585 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014586 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014587 return NULL;
14588}
14589
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014590/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014591
Victor Stinnera47082312012-10-04 02:19:54 +020014592/* Format a float into the writer if the writer is not NULL, or into *p_output
14593 otherwise.
14594
14595 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014596static int
Victor Stinnera47082312012-10-04 02:19:54 +020014597formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14598 PyObject **p_output,
14599 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014600{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014601 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014602 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014603 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014604 int prec;
14605 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014606
Guido van Rossumd57fd912000-03-10 22:53:23 +000014607 x = PyFloat_AsDouble(v);
14608 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014609 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014610
Victor Stinnera47082312012-10-04 02:19:54 +020014611 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014612 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014613 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014614
Victor Stinnera47082312012-10-04 02:19:54 +020014615 if (arg->flags & F_ALT)
14616 dtoa_flags = Py_DTSF_ALT;
14617 else
14618 dtoa_flags = 0;
14619 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014620 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014621 return -1;
14622 len = strlen(p);
14623 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014624 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014625 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014626 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014627 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014628 }
14629 else
14630 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014631 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014632 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014633}
14634
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate-form flag, for Python ints.
 * Return value:  a new str object, or NULL with an exception set on error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The base marker is present only for o, x and X conversions when
 *         alt is nonzero.  The case of hex digits will be correct.
 *     There will be at least prec digits, zero-filled on the left if
 *         necessary to get that many.
 * val          object to be converted; must be an int (asserted)
 * alt          nonzero to keep the base marker (the F_ALT flag)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions can produce a '-' sign for negative
 * values.
 */
PyObject *
_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
{
    PyObject *result = NULL;
    char *buf;
    Py_ssize_t i;
    int sign;           /* 1 if '-', else 0 */
    int len;            /* number of characters */
    Py_ssize_t llen;
    int numdigits;      /* len == numnondigits + numdigits */
    int numnondigits = 0;

    /* numnondigits is at most 3 (sign + two-character base marker), so this
       bound keeps numnondigits + prec below within the range of int. */
    if (prec > INT_MAX-3) {
        PyErr_SetString(PyExc_OverflowError,
                        "precision too large");
        return NULL;
    }

    assert(PyLong_Check(val));

    /* Produce the digits, with base marker for octal and hex. */
    switch (type) {
    default:
        Py_UNREACHABLE();
    case 'd':
    case 'i':
    case 'u':
        /* int and int subclasses should print numerically when a numeric */
        /* format code is used (see issue18780) */
        result = PyNumber_ToBase(val, 10);
        break;
    case 'o':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 8);
        break;
    case 'x':
    case 'X':
        numnondigits = 2;
        result = PyNumber_ToBase(val, 16);
        break;
    }
    if (!result)
        return NULL;

    assert(unicode_modifiable(result));
    assert(PyUnicode_IS_READY(result));
    assert(PyUnicode_IS_ASCII(result));

    /* To modify the string in-place, there can only be one reference. */
    if (Py_REFCNT(result) != 1) {
        Py_DECREF(result);
        PyErr_BadInternalCall();
        return NULL;
    }
    buf = PyUnicode_DATA(result);
    llen = PyUnicode_GET_LENGTH(result);
    if (llen > INT_MAX) {
        Py_DECREF(result);
        PyErr_SetString(PyExc_ValueError,
                        "string too large in _PyUnicode_FormatLong");
        return NULL;
    }
    len = (int)llen;
    sign = buf[0] == '-';
    numnondigits += sign;
    numdigits = len - numnondigits;
    assert(numdigits > 0);

    /* Get rid of base marker unless alt is set: skip two characters and, for
       a negative value, rewrite the '-' at the new start of the text. */
    if (((alt) == 0 &&
        (type == 'o' || type == 'x' || type == 'X'))) {
        assert(buf[sign] == '0');
        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
               buf[sign+1] == 'o');
        numnondigits -= 2;
        buf += 2;
        len -= 2;
        if (sign)
            buf[0] = '-';
        assert(len == numnondigits + numdigits);
        assert(numdigits > 0);
    }

    /* Fill with leading zeroes to meet minimum width.  The padded text is
       assembled in a temporary bytes object, which replaces `result` until
       the final conversion back to str below. */
    if (prec > numdigits) {
        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
                                numnondigits + prec);
        char *b1;
        if (!r1) {
            Py_DECREF(result);
            return NULL;
        }
        b1 = PyBytes_AS_STRING(r1);
        /* sign and base marker, then the zero fill, then the digits */
        for (i = 0; i < numnondigits; ++i)
            *b1++ = *buf++;
        for (i = 0; i < prec - numdigits; i++)
            *b1++ = '0';
        for (i = 0; i < numdigits; i++)
            *b1++ = *buf++;
        *b1 = '\0';
        Py_DECREF(result);
        result = r1;
        buf = PyBytes_AS_STRING(result);
        len = numnondigits + prec;
    }

    /* Fix up case for hex conversions. */
    if (type == 'X') {
        /* Need to convert all lower case letters to upper case.
           and need to convert 0x to 0X (and -0x to -0X). */
        for (i = 0; i < len; i++)
            if (buf[i] >= 'a' && buf[i] <= 'x')
                buf[i] -= 'a'-'A';
    }
    /* If the text now lives in a bytes object, or no longer starts at the
       beginning of the str's data, build a fresh str from it; otherwise
       only adjust the length of the existing str if it changed. */
    if (!PyUnicode_Check(result)
        || buf != PyUnicode_DATA(result)) {
        PyObject *unicode;
        unicode = _PyUnicode_FromASCII(buf, len);
        Py_DECREF(result);
        result = unicode;
    }
    else if (len != PyUnicode_GET_LENGTH(result)) {
        if (PyUnicode_Resize(&result, len) < 0)
            Py_CLEAR(result);
    }
    return result;
}
14780
Ethan Furmandf3ed242014-01-05 06:50:30 -080014781/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014782 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014783 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014784 * -1 and raise an exception on error */
14785static int
Victor Stinnera47082312012-10-04 02:19:54 +020014786mainformatlong(PyObject *v,
14787 struct unicode_format_arg_t *arg,
14788 PyObject **p_output,
14789 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014790{
14791 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014792 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014793
14794 if (!PyNumber_Check(v))
14795 goto wrongtype;
14796
Ethan Furman9ab74802014-03-21 06:38:46 -070014797 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014798 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014799 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014800 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014801 }
14802 else {
14803 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014804 }
14805 if (iobj == NULL ) {
14806 if (PyErr_ExceptionMatches(PyExc_TypeError))
14807 goto wrongtype;
14808 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014809 }
14810 assert(PyLong_Check(iobj));
14811 }
14812 else {
14813 iobj = v;
14814 Py_INCREF(iobj);
14815 }
14816
14817 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014818 && arg->width == -1 && arg->prec == -1
14819 && !(arg->flags & (F_SIGN | F_BLANK))
14820 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014821 {
14822 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014823 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014824 int base;
14825
Victor Stinnera47082312012-10-04 02:19:54 +020014826 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014827 {
14828 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014829 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014830 case 'd':
14831 case 'i':
14832 case 'u':
14833 base = 10;
14834 break;
14835 case 'o':
14836 base = 8;
14837 break;
14838 case 'x':
14839 case 'X':
14840 base = 16;
14841 break;
14842 }
14843
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014844 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14845 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014846 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014847 }
14848 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014849 return 1;
14850 }
14851
Ethan Furmanb95b5612015-01-23 20:05:18 -080014852 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014853 Py_DECREF(iobj);
14854 if (res == NULL)
14855 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014856 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014857 return 0;
14858
14859wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014860 switch(type)
14861 {
14862 case 'o':
14863 case 'x':
14864 case 'X':
14865 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014866 "%%%c format: an integer is required, "
14867 "not %.200s",
14868 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014869 break;
14870 default:
14871 PyErr_Format(PyExc_TypeError,
Serhiy Storchakae2ec0b22020-10-09 14:14:37 +030014872 "%%%c format: a real number is required, "
Victor Stinner998b8062018-09-12 00:23:25 +020014873 "not %.200s",
14874 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014875 break;
14876 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014877 return -1;
14878}
14879
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014880static Py_UCS4
14881formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014882{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014883 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014884 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014885 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014886 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014887 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014888 goto onError;
14889 }
14890 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014891 int overflow;
14892 long x = PyLong_AsLongAndOverflow(v, &overflow);
14893 if (x == -1 && PyErr_Occurred()) {
14894 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014895 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014896 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014897 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014898 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014899
Victor Stinner8faf8212011-12-08 22:14:11 +010014900 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014901 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014902 PyErr_SetString(PyExc_OverflowError,
14903 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014904 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014905 }
14906
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014907 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014908 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014909
Benjamin Peterson29060642009-01-31 22:14:21 +000014910 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014911 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014912 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014913 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014914}
14915
Victor Stinnera47082312012-10-04 02:19:54 +020014916/* Parse options of an argument: flags, width, precision.
14917 Handle also "%(name)" syntax.
14918
14919 Return 0 if the argument has been formatted into arg->str.
14920 Return 1 if the argument has been written into ctx->writer,
14921 Raise an exception and return -1 on error. */
14922static int
14923unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14924 struct unicode_format_arg_t *arg)
14925{
14926#define FORMAT_READ(ctx) \
14927 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14928
14929 PyObject *v;
14930
Victor Stinnera47082312012-10-04 02:19:54 +020014931 if (arg->ch == '(') {
14932 /* Get argument value from a dictionary. Example: "%(name)s". */
14933 Py_ssize_t keystart;
14934 Py_ssize_t keylen;
14935 PyObject *key;
14936 int pcount = 1;
14937
14938 if (ctx->dict == NULL) {
14939 PyErr_SetString(PyExc_TypeError,
14940 "format requires a mapping");
14941 return -1;
14942 }
14943 ++ctx->fmtpos;
14944 --ctx->fmtcnt;
14945 keystart = ctx->fmtpos;
14946 /* Skip over balanced parentheses */
14947 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14948 arg->ch = FORMAT_READ(ctx);
14949 if (arg->ch == ')')
14950 --pcount;
14951 else if (arg->ch == '(')
14952 ++pcount;
14953 ctx->fmtpos++;
14954 }
14955 keylen = ctx->fmtpos - keystart - 1;
14956 if (ctx->fmtcnt < 0 || pcount > 0) {
14957 PyErr_SetString(PyExc_ValueError,
14958 "incomplete format key");
14959 return -1;
14960 }
14961 key = PyUnicode_Substring(ctx->fmtstr,
14962 keystart, keystart + keylen);
14963 if (key == NULL)
14964 return -1;
14965 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014966 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014967 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014968 }
14969 ctx->args = PyObject_GetItem(ctx->dict, key);
14970 Py_DECREF(key);
14971 if (ctx->args == NULL)
14972 return -1;
14973 ctx->args_owned = 1;
14974 ctx->arglen = -1;
14975 ctx->argidx = -2;
14976 }
14977
14978 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014979 while (--ctx->fmtcnt >= 0) {
14980 arg->ch = FORMAT_READ(ctx);
14981 ctx->fmtpos++;
14982 switch (arg->ch) {
14983 case '-': arg->flags |= F_LJUST; continue;
14984 case '+': arg->flags |= F_SIGN; continue;
14985 case ' ': arg->flags |= F_BLANK; continue;
14986 case '#': arg->flags |= F_ALT; continue;
14987 case '0': arg->flags |= F_ZERO; continue;
14988 }
14989 break;
14990 }
14991
14992 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014993 if (arg->ch == '*') {
14994 v = unicode_format_getnextarg(ctx);
14995 if (v == NULL)
14996 return -1;
14997 if (!PyLong_Check(v)) {
14998 PyErr_SetString(PyExc_TypeError,
14999 "* wants int");
15000 return -1;
15001 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015002 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015003 if (arg->width == -1 && PyErr_Occurred())
15004 return -1;
15005 if (arg->width < 0) {
15006 arg->flags |= F_LJUST;
15007 arg->width = -arg->width;
15008 }
15009 if (--ctx->fmtcnt >= 0) {
15010 arg->ch = FORMAT_READ(ctx);
15011 ctx->fmtpos++;
15012 }
15013 }
15014 else if (arg->ch >= '0' && arg->ch <= '9') {
15015 arg->width = arg->ch - '0';
15016 while (--ctx->fmtcnt >= 0) {
15017 arg->ch = FORMAT_READ(ctx);
15018 ctx->fmtpos++;
15019 if (arg->ch < '0' || arg->ch > '9')
15020 break;
15021 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
15022 mixing signed and unsigned comparison. Since arg->ch is between
15023 '0' and '9', casting to int is safe. */
15024 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
15025 PyErr_SetString(PyExc_ValueError,
15026 "width too big");
15027 return -1;
15028 }
15029 arg->width = arg->width*10 + (arg->ch - '0');
15030 }
15031 }
15032
15033 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020015034 if (arg->ch == '.') {
15035 arg->prec = 0;
15036 if (--ctx->fmtcnt >= 0) {
15037 arg->ch = FORMAT_READ(ctx);
15038 ctx->fmtpos++;
15039 }
15040 if (arg->ch == '*') {
15041 v = unicode_format_getnextarg(ctx);
15042 if (v == NULL)
15043 return -1;
15044 if (!PyLong_Check(v)) {
15045 PyErr_SetString(PyExc_TypeError,
15046 "* wants int");
15047 return -1;
15048 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015049 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015050 if (arg->prec == -1 && PyErr_Occurred())
15051 return -1;
15052 if (arg->prec < 0)
15053 arg->prec = 0;
15054 if (--ctx->fmtcnt >= 0) {
15055 arg->ch = FORMAT_READ(ctx);
15056 ctx->fmtpos++;
15057 }
15058 }
15059 else if (arg->ch >= '0' && arg->ch <= '9') {
15060 arg->prec = arg->ch - '0';
15061 while (--ctx->fmtcnt >= 0) {
15062 arg->ch = FORMAT_READ(ctx);
15063 ctx->fmtpos++;
15064 if (arg->ch < '0' || arg->ch > '9')
15065 break;
15066 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15067 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020015068 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020015069 return -1;
15070 }
15071 arg->prec = arg->prec*10 + (arg->ch - '0');
15072 }
15073 }
15074 }
15075
15076 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15077 if (ctx->fmtcnt >= 0) {
15078 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15079 if (--ctx->fmtcnt >= 0) {
15080 arg->ch = FORMAT_READ(ctx);
15081 ctx->fmtpos++;
15082 }
15083 }
15084 }
15085 if (ctx->fmtcnt < 0) {
15086 PyErr_SetString(PyExc_ValueError,
15087 "incomplete format");
15088 return -1;
15089 }
15090 return 0;
15091
15092#undef FORMAT_READ
15093}
15094
15095/* Format one argument. Supported conversion specifiers:
15096
15097 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080015098 - "i", "d", "u": int or float
15099 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020015100 - "e", "E", "f", "F", "g", "G": float
15101 - "c": int or str (1 character)
15102
Victor Stinner8dbd4212012-12-04 09:30:24 +010015103 When possible, the output is written directly into the Unicode writer
15104 (ctx->writer). A string is created when padding is required.
15105
Victor Stinnera47082312012-10-04 02:19:54 +020015106 Return 0 if the argument has been formatted into *p_str,
15107 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010015108 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020015109static int
15110unicode_format_arg_format(struct unicode_formatter_t *ctx,
15111 struct unicode_format_arg_t *arg,
15112 PyObject **p_str)
15113{
15114 PyObject *v;
15115 _PyUnicodeWriter *writer = &ctx->writer;
15116
15117 if (ctx->fmtcnt == 0)
15118 ctx->writer.overallocate = 0;
15119
Victor Stinnera47082312012-10-04 02:19:54 +020015120 v = unicode_format_getnextarg(ctx);
15121 if (v == NULL)
15122 return -1;
15123
Victor Stinnera47082312012-10-04 02:19:54 +020015124
15125 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020015126 case 's':
15127 case 'r':
15128 case 'a':
15129 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15130 /* Fast path */
15131 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15132 return -1;
15133 return 1;
15134 }
15135
15136 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15137 *p_str = v;
15138 Py_INCREF(*p_str);
15139 }
15140 else {
15141 if (arg->ch == 's')
15142 *p_str = PyObject_Str(v);
15143 else if (arg->ch == 'r')
15144 *p_str = PyObject_Repr(v);
15145 else
15146 *p_str = PyObject_ASCII(v);
15147 }
15148 break;
15149
15150 case 'i':
15151 case 'd':
15152 case 'u':
15153 case 'o':
15154 case 'x':
15155 case 'X':
15156 {
15157 int ret = mainformatlong(v, arg, p_str, writer);
15158 if (ret != 0)
15159 return ret;
15160 arg->sign = 1;
15161 break;
15162 }
15163
15164 case 'e':
15165 case 'E':
15166 case 'f':
15167 case 'F':
15168 case 'g':
15169 case 'G':
15170 if (arg->width == -1 && arg->prec == -1
15171 && !(arg->flags & (F_SIGN | F_BLANK)))
15172 {
15173 /* Fast path */
15174 if (formatfloat(v, arg, NULL, writer) == -1)
15175 return -1;
15176 return 1;
15177 }
15178
15179 arg->sign = 1;
15180 if (formatfloat(v, arg, p_str, NULL) == -1)
15181 return -1;
15182 break;
15183
15184 case 'c':
15185 {
15186 Py_UCS4 ch = formatchar(v);
15187 if (ch == (Py_UCS4) -1)
15188 return -1;
15189 if (arg->width == -1 && arg->prec == -1) {
15190 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015191 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015192 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015193 return 1;
15194 }
15195 *p_str = PyUnicode_FromOrdinal(ch);
15196 break;
15197 }
15198
15199 default:
15200 PyErr_Format(PyExc_ValueError,
15201 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015202 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015203 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15204 (int)arg->ch,
15205 ctx->fmtpos - 1);
15206 return -1;
15207 }
15208 if (*p_str == NULL)
15209 return -1;
15210 assert (PyUnicode_Check(*p_str));
15211 return 0;
15212}
15213
15214static int
15215unicode_format_arg_output(struct unicode_formatter_t *ctx,
15216 struct unicode_format_arg_t *arg,
15217 PyObject *str)
15218{
15219 Py_ssize_t len;
15220 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015221 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015222 Py_ssize_t pindex;
15223 Py_UCS4 signchar;
15224 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015225 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015226 Py_ssize_t sublen;
15227 _PyUnicodeWriter *writer = &ctx->writer;
15228 Py_UCS4 fill;
15229
15230 fill = ' ';
15231 if (arg->sign && arg->flags & F_ZERO)
15232 fill = '0';
15233
15234 if (PyUnicode_READY(str) == -1)
15235 return -1;
15236
15237 len = PyUnicode_GET_LENGTH(str);
15238 if ((arg->width == -1 || arg->width <= len)
15239 && (arg->prec == -1 || arg->prec >= len)
15240 && !(arg->flags & (F_SIGN | F_BLANK)))
15241 {
15242 /* Fast path */
15243 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15244 return -1;
15245 return 0;
15246 }
15247
15248 /* Truncate the string for "s", "r" and "a" formats
15249 if the precision is set */
15250 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15251 if (arg->prec >= 0 && len > arg->prec)
15252 len = arg->prec;
15253 }
15254
15255 /* Adjust sign and width */
15256 kind = PyUnicode_KIND(str);
15257 pbuf = PyUnicode_DATA(str);
15258 pindex = 0;
15259 signchar = '\0';
15260 if (arg->sign) {
15261 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15262 if (ch == '-' || ch == '+') {
15263 signchar = ch;
15264 len--;
15265 pindex++;
15266 }
15267 else if (arg->flags & F_SIGN)
15268 signchar = '+';
15269 else if (arg->flags & F_BLANK)
15270 signchar = ' ';
15271 else
15272 arg->sign = 0;
15273 }
15274 if (arg->width < len)
15275 arg->width = len;
15276
15277 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015278 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015279 if (!(arg->flags & F_LJUST)) {
15280 if (arg->sign) {
15281 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015282 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015283 }
15284 else {
15285 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015286 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015287 }
15288 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015289 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15290 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015291 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015292 }
15293
Victor Stinnera47082312012-10-04 02:19:54 +020015294 buflen = arg->width;
15295 if (arg->sign && len == arg->width)
15296 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015297 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015298 return -1;
15299
15300 /* Write the sign if needed */
15301 if (arg->sign) {
15302 if (fill != ' ') {
15303 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15304 writer->pos += 1;
15305 }
15306 if (arg->width > len)
15307 arg->width--;
15308 }
15309
15310 /* Write the numeric prefix for "x", "X" and "o" formats
15311 if the alternate form is used.
15312 For example, write "0x" for the "%#x" format. */
15313 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15314 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15315 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15316 if (fill != ' ') {
15317 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15318 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15319 writer->pos += 2;
15320 pindex += 2;
15321 }
15322 arg->width -= 2;
15323 if (arg->width < 0)
15324 arg->width = 0;
15325 len -= 2;
15326 }
15327
15328 /* Pad left with the fill character if needed */
15329 if (arg->width > len && !(arg->flags & F_LJUST)) {
15330 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015331 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015332 writer->pos += sublen;
15333 arg->width = len;
15334 }
15335
15336 /* If padding with spaces: write sign if needed and/or numeric prefix if
15337 the alternate form is used */
15338 if (fill == ' ') {
15339 if (arg->sign) {
15340 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15341 writer->pos += 1;
15342 }
15343 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15344 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15345 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15346 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15347 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15348 writer->pos += 2;
15349 pindex += 2;
15350 }
15351 }
15352
15353 /* Write characters */
15354 if (len) {
15355 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15356 str, pindex, len);
15357 writer->pos += len;
15358 }
15359
15360 /* Pad right with the fill character if needed */
15361 if (arg->width > len) {
15362 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015363 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015364 writer->pos += sublen;
15365 }
15366 return 0;
15367}
15368
15369/* Helper of PyUnicode_Format(): format one arg.
15370 Return 0 on success, raise an exception and return -1 on error. */
15371static int
15372unicode_format_arg(struct unicode_formatter_t *ctx)
15373{
15374 struct unicode_format_arg_t arg;
15375 PyObject *str;
15376 int ret;
15377
Victor Stinner8dbd4212012-12-04 09:30:24 +010015378 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015379 if (arg.ch == '%') {
15380 ctx->fmtpos++;
15381 ctx->fmtcnt--;
15382 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15383 return -1;
15384 return 0;
15385 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015386 arg.flags = 0;
15387 arg.width = -1;
15388 arg.prec = -1;
15389 arg.sign = 0;
15390 str = NULL;
15391
Victor Stinnera47082312012-10-04 02:19:54 +020015392 ret = unicode_format_arg_parse(ctx, &arg);
15393 if (ret == -1)
15394 return -1;
15395
15396 ret = unicode_format_arg_format(ctx, &arg, &str);
15397 if (ret == -1)
15398 return -1;
15399
15400 if (ret != 1) {
15401 ret = unicode_format_arg_output(ctx, &arg, str);
15402 Py_DECREF(str);
15403 if (ret == -1)
15404 return -1;
15405 }
15406
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015407 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015408 PyErr_SetString(PyExc_TypeError,
15409 "not all arguments converted during string formatting");
15410 return -1;
15411 }
15412 return 0;
15413}
15414
Alexander Belopolsky40018472011-02-26 01:02:56 +000015415PyObject *
15416PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015417{
Victor Stinnera47082312012-10-04 02:19:54 +020015418 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015419
Guido van Rossumd57fd912000-03-10 22:53:23 +000015420 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015421 PyErr_BadInternalCall();
15422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015423 }
Victor Stinnera47082312012-10-04 02:19:54 +020015424
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015425 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015426 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015427
15428 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015429 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15430 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15431 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15432 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015433
Victor Stinner8f674cc2013-04-17 23:02:17 +020015434 _PyUnicodeWriter_Init(&ctx.writer);
15435 ctx.writer.min_length = ctx.fmtcnt + 100;
15436 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015437
Guido van Rossumd57fd912000-03-10 22:53:23 +000015438 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015439 ctx.arglen = PyTuple_Size(args);
15440 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015441 }
15442 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015443 ctx.arglen = -1;
15444 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015445 }
Victor Stinnera47082312012-10-04 02:19:54 +020015446 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015447 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015448 ctx.dict = args;
15449 else
15450 ctx.dict = NULL;
15451 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015452
Victor Stinnera47082312012-10-04 02:19:54 +020015453 while (--ctx.fmtcnt >= 0) {
15454 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015455 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015456
15457 nonfmtpos = ctx.fmtpos++;
15458 while (ctx.fmtcnt >= 0 &&
15459 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15460 ctx.fmtpos++;
15461 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015462 }
Victor Stinnera47082312012-10-04 02:19:54 +020015463 if (ctx.fmtcnt < 0) {
15464 ctx.fmtpos--;
15465 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015466 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015467
Victor Stinnercfc4c132013-04-03 01:48:39 +020015468 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15469 nonfmtpos, ctx.fmtpos) < 0)
15470 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015471 }
15472 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015473 ctx.fmtpos++;
15474 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015475 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015476 }
15477 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015478
Victor Stinnera47082312012-10-04 02:19:54 +020015479 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015480 PyErr_SetString(PyExc_TypeError,
15481 "not all arguments converted during string formatting");
15482 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015483 }
15484
Victor Stinnera47082312012-10-04 02:19:54 +020015485 if (ctx.args_owned) {
15486 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015487 }
Victor Stinnera47082312012-10-04 02:19:54 +020015488 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015489
Benjamin Peterson29060642009-01-31 22:14:21 +000015490 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015491 _PyUnicodeWriter_Dealloc(&ctx.writer);
15492 if (ctx.args_owned) {
15493 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015494 }
15495 return NULL;
15496}
15497
Jeremy Hylton938ace62002-07-17 16:30:39 +000015498static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015499unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15500
15501/*[clinic input]
15502@classmethod
15503str.__new__ as unicode_new
15504
15505 object as x: object = NULL
15506 encoding: str = NULL
15507 errors: str = NULL
15508
15509[clinic start generated code]*/
Guido van Rossume023fe02001-08-30 03:12:59 +000015510
Tim Peters6d6c1a32001-08-02 04:15:00 +000015511static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015512unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15513 const char *errors)
15514/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
Tim Peters6d6c1a32001-08-02 04:15:00 +000015515{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015516 PyObject *unicode;
15517 if (x == NULL) {
15518 unicode = unicode_new_empty();
15519 }
15520 else if (encoding == NULL && errors == NULL) {
15521 unicode = PyObject_Str(x);
15522 }
15523 else {
15524 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15525 }
Tim Peters6d6c1a32001-08-02 04:15:00 +000015526
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015527 if (unicode != NULL && type != &PyUnicode_Type) {
15528 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15529 }
15530 return unicode;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015531}
15532
Guido van Rossume023fe02001-08-30 03:12:59 +000015533static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015534unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
Guido van Rossume023fe02001-08-30 03:12:59 +000015535{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015536 PyObject *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015537 Py_ssize_t length, char_size;
15538 int share_wstr, share_utf8;
15539 unsigned int kind;
15540 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015541
Benjamin Peterson14339b62009-01-31 16:36:08 +000015542 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner910337b2011-10-03 03:20:16 +020015543 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015544 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015545 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015546 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015547
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015548 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015549 if (self == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015550 return NULL;
15551 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015552 kind = PyUnicode_KIND(unicode);
15553 length = PyUnicode_GET_LENGTH(unicode);
15554
15555 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015556#ifdef Py_DEBUG
15557 _PyUnicode_HASH(self) = -1;
15558#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015559 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015560#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015561 _PyUnicode_STATE(self).interned = 0;
15562 _PyUnicode_STATE(self).kind = kind;
15563 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015564 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015565 _PyUnicode_STATE(self).ready = 1;
15566 _PyUnicode_WSTR(self) = NULL;
15567 _PyUnicode_UTF8_LENGTH(self) = 0;
15568 _PyUnicode_UTF8(self) = NULL;
15569 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015570 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015571
15572 share_utf8 = 0;
15573 share_wstr = 0;
15574 if (kind == PyUnicode_1BYTE_KIND) {
15575 char_size = 1;
15576 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15577 share_utf8 = 1;
15578 }
15579 else if (kind == PyUnicode_2BYTE_KIND) {
15580 char_size = 2;
15581 if (sizeof(wchar_t) == 2)
15582 share_wstr = 1;
15583 }
15584 else {
15585 assert(kind == PyUnicode_4BYTE_KIND);
15586 char_size = 4;
15587 if (sizeof(wchar_t) == 4)
15588 share_wstr = 1;
15589 }
15590
15591 /* Ensure we won't overflow the length. */
15592 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15593 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015594 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015595 }
Victor Stinner32bd68c2020-12-01 10:37:39 +010015596 data = PyObject_Malloc((length + 1) * char_size);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015597 if (data == NULL) {
15598 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015599 goto onError;
15600 }
15601
Victor Stinnerc3c74152011-10-02 20:39:55 +020015602 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015603 if (share_utf8) {
15604 _PyUnicode_UTF8_LENGTH(self) = length;
15605 _PyUnicode_UTF8(self) = data;
15606 }
15607 if (share_wstr) {
15608 _PyUnicode_WSTR_LENGTH(self) = length;
15609 _PyUnicode_WSTR(self) = (wchar_t *)data;
15610 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015611
Christian Heimesf051e432016-09-13 20:22:02 +020015612 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015613 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015614 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015615#ifdef Py_DEBUG
15616 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15617#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +010015618 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015619
15620onError:
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015621 Py_DECREF(self);
15622 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015623}
15624
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015625PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015626"str(object='') -> str\n\
15627str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015628\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015629Create a new string object from the given object. If encoding or\n\
15630errors is specified, then the object must expose a data buffer\n\
15631that will be decoded using the given encoding and error handler.\n\
15632Otherwise, returns the result of object.__str__() (if defined)\n\
15633or repr(object).\n\
15634encoding defaults to sys.getdefaultencoding().\n\
15635errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015636
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015637static PyObject *unicode_iter(PyObject *seq);
15638
Guido van Rossumd57fd912000-03-10 22:53:23 +000015639PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015640 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015641 "str", /* tp_name */
15642 sizeof(PyUnicodeObject), /* tp_basicsize */
15643 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015644 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015645 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015646 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015647 0, /* tp_getattr */
15648 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015649 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015650 unicode_repr, /* tp_repr */
15651 &unicode_as_number, /* tp_as_number */
15652 &unicode_as_sequence, /* tp_as_sequence */
15653 &unicode_as_mapping, /* tp_as_mapping */
15654 (hashfunc) unicode_hash, /* tp_hash*/
15655 0, /* tp_call*/
15656 (reprfunc) unicode_str, /* tp_str */
15657 PyObject_GenericGetAttr, /* tp_getattro */
15658 0, /* tp_setattro */
15659 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015660 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015661 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15662 unicode_doc, /* tp_doc */
15663 0, /* tp_traverse */
15664 0, /* tp_clear */
15665 PyUnicode_RichCompare, /* tp_richcompare */
15666 0, /* tp_weaklistoffset */
15667 unicode_iter, /* tp_iter */
15668 0, /* tp_iternext */
15669 unicode_methods, /* tp_methods */
15670 0, /* tp_members */
15671 0, /* tp_getset */
15672 &PyBaseObject_Type, /* tp_base */
15673 0, /* tp_dict */
15674 0, /* tp_descr_get */
15675 0, /* tp_descr_set */
15676 0, /* tp_dictoffset */
15677 0, /* tp_init */
15678 0, /* tp_alloc */
15679 unicode_new, /* tp_new */
15680 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015681};
15682
15683/* Initialize the Unicode implementation */
15684
Victor Stinner331a6a52019-05-27 16:39:22 +020015685PyStatus
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015686_PyUnicode_Init(PyThreadState *tstate)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015687{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015688 /* XXX - move this array to unicodectype.c ? */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015689 const Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015690 0x000A, /* LINE FEED */
15691 0x000D, /* CARRIAGE RETURN */
15692 0x001C, /* FILE SEPARATOR */
15693 0x001D, /* GROUP SEPARATOR */
15694 0x001E, /* RECORD SEPARATOR */
15695 0x0085, /* NEXT LINE */
15696 0x2028, /* LINE SEPARATOR */
15697 0x2029, /* PARAGRAPH SEPARATOR */
15698 };
15699
Victor Stinner91698d82020-06-25 14:07:40 +020015700 struct _Py_unicode_state *state = &tstate->interp->unicode;
15701 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015702 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015703 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015704
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015705 if (_Py_IsMainInterpreter(tstate)) {
15706 /* initialize the linebreak bloom filter */
15707 bloom_linebreak = make_bloom_mask(
15708 PyUnicode_2BYTE_KIND, linebreak,
15709 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters477c8d52006-05-27 19:21:47 +000015710
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015711 if (PyType_Ready(&PyUnicode_Type) < 0) {
15712 return _PyStatus_ERR("Can't initialize unicode type");
15713 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015714
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015715 if (PyType_Ready(&EncodingMapType) < 0) {
15716 return _PyStatus_ERR("Can't initialize encoding map type");
15717 }
15718 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15719 return _PyStatus_ERR("Can't initialize field name iterator type");
15720 }
15721 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15722 return _PyStatus_ERR("Can't initialize formatter iter type");
15723 }
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015724 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015725 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015726}
15727
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015728
Walter Dörwald16807132007-05-25 13:52:07 +000015729void
15730PyUnicode_InternInPlace(PyObject **p)
15731{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015732 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015733#ifdef Py_DEBUG
15734 assert(s != NULL);
15735 assert(_PyUnicode_CHECK(s));
15736#else
Victor Stinner607b1022020-05-05 18:50:30 +020015737 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015738 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015739 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015740#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015741
Benjamin Peterson14339b62009-01-31 16:36:08 +000015742 /* If it's a subclass, we don't really know what putting
15743 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015744 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015745 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015746 }
15747
15748 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015749 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015750 }
15751
Victor Stinner666ecfb2020-07-02 01:19:57 +020015752 if (PyUnicode_READY(s) == -1) {
15753 PyErr_Clear();
15754 return;
15755 }
15756
Victor Stinnerea251802020-12-26 02:58:33 +010015757 struct _Py_unicode_state *state = get_unicode_state();
15758 if (state->interned == NULL) {
15759 state->interned = PyDict_New();
15760 if (state->interned == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015761 PyErr_Clear(); /* Don't leave an exception */
15762 return;
15763 }
15764 }
Victor Stinner607b1022020-05-05 18:50:30 +020015765
Victor Stinnerea251802020-12-26 02:58:33 +010015766 PyObject *t = PyDict_SetDefault(state->interned, s, s);
Berker Peksagced8d4c2016-07-25 04:40:39 +030015767 if (t == NULL) {
15768 PyErr_Clear();
15769 return;
15770 }
Victor Stinner607b1022020-05-05 18:50:30 +020015771
Berker Peksagced8d4c2016-07-25 04:40:39 +030015772 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015773 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015774 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015775 return;
15776 }
Victor Stinner607b1022020-05-05 18:50:30 +020015777
Victor Stinner3549ca32020-07-03 16:59:12 +020015778 /* The two references in interned dict (key and value) are not counted by
15779 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15780 this. */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015781 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015782 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015783}
15784
Victor Stinnerea251802020-12-26 02:58:33 +010015785
Walter Dörwald16807132007-05-25 13:52:07 +000015786void
15787PyUnicode_InternImmortal(PyObject **p)
15788{
Victor Stinner583ee5a2020-10-02 14:49:00 +020015789 if (PyErr_WarnEx(PyExc_DeprecationWarning,
15790 "PyUnicode_InternImmortal() is deprecated; "
15791 "use PyUnicode_InternInPlace() instead", 1) < 0)
15792 {
15793 // The function has no return value, the exception cannot
15794 // be reported to the caller, so just log it.
15795 PyErr_WriteUnraisable(NULL);
15796 }
15797
Benjamin Peterson14339b62009-01-31 16:36:08 +000015798 PyUnicode_InternInPlace(p);
15799 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015800 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015801 Py_INCREF(*p);
15802 }
Walter Dörwald16807132007-05-25 13:52:07 +000015803}
15804
15805PyObject *
15806PyUnicode_InternFromString(const char *cp)
15807{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015808 PyObject *s = PyUnicode_FromString(cp);
15809 if (s == NULL)
15810 return NULL;
15811 PyUnicode_InternInPlace(&s);
15812 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015813}
15814
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015815
Victor Stinner666ecfb2020-07-02 01:19:57 +020015816void
15817_PyUnicode_ClearInterned(PyThreadState *tstate)
Walter Dörwald16807132007-05-25 13:52:07 +000015818{
Victor Stinnerea251802020-12-26 02:58:33 +010015819 struct _Py_unicode_state *state = &tstate->interp->unicode;
15820 if (state->interned == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015821 return;
15822 }
Victor Stinnerea251802020-12-26 02:58:33 +010015823 assert(PyDict_CheckExact(state->interned));
Victor Stinner666ecfb2020-07-02 01:19:57 +020015824
15825 /* Interned unicode strings are not forcibly deallocated; rather, we give
15826 them their stolen references back, and then clear and DECREF the
15827 interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015828
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015829#ifdef INTERNED_STATS
Victor Stinnerea251802020-12-26 02:58:33 +010015830 fprintf(stderr, "releasing %zd interned strings\n",
15831 PyDict_GET_SIZE(state->interned));
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015832
15833 Py_ssize_t immortal_size = 0, mortal_size = 0;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015834#endif
Victor Stinnerea251802020-12-26 02:58:33 +010015835 Py_ssize_t pos = 0;
15836 PyObject *s, *ignored_value;
15837 while (PyDict_Next(state->interned, &pos, &s, &ignored_value)) {
Victor Stinner666ecfb2020-07-02 01:19:57 +020015838 assert(PyUnicode_IS_READY(s));
15839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015840 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015841 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015842 Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015843#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015844 immortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015845#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015846 break;
15847 case SSTATE_INTERNED_MORTAL:
Victor Stinner3549ca32020-07-03 16:59:12 +020015848 // Restore the two references (key and value) ignored
15849 // by PyUnicode_InternInPlace().
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015850 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015851#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015852 mortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015853#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015854 break;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015855 case SSTATE_NOT_INTERNED:
15856 /* fall through */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015857 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015858 Py_UNREACHABLE();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015859 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015860 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015861 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015862#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015863 fprintf(stderr,
15864 "total size of all interned strings: %zd/%zd mortal/immortal\n",
15865 mortal_size, immortal_size);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015866#endif
Victor Stinner666ecfb2020-07-02 01:19:57 +020015867
Victor Stinnerea251802020-12-26 02:58:33 +010015868 PyDict_Clear(state->interned);
15869 Py_CLEAR(state->interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015870}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015871
15872
15873/********************* Unicode Iterator **************************/
15874
15875typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015876 PyObject_HEAD
15877 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015878 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015879} unicodeiterobject;
15880
15881static void
15882unicodeiter_dealloc(unicodeiterobject *it)
15883{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015884 _PyObject_GC_UNTRACK(it);
15885 Py_XDECREF(it->it_seq);
15886 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015887}
15888
15889static int
15890unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15891{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015892 Py_VISIT(it->it_seq);
15893 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015894}
15895
15896static PyObject *
15897unicodeiter_next(unicodeiterobject *it)
15898{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015899 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015900
Benjamin Peterson14339b62009-01-31 16:36:08 +000015901 assert(it != NULL);
15902 seq = it->it_seq;
15903 if (seq == NULL)
15904 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015905 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015907 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15908 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015909 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015910 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15911 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015912 if (item != NULL)
15913 ++it->it_index;
15914 return item;
15915 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015916
Benjamin Peterson14339b62009-01-31 16:36:08 +000015917 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015918 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015919 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015920}
15921
15922static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015923unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015924{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015925 Py_ssize_t len = 0;
15926 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015927 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015928 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015929}
15930
15931PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15932
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015933static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015934unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015935{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015936 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015937 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015938 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015939 it->it_seq, it->it_index);
15940 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015941 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015942 if (u == NULL)
15943 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015944 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015945 }
15946}
15947
15948PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15949
15950static PyObject *
15951unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15952{
15953 Py_ssize_t index = PyLong_AsSsize_t(state);
15954 if (index == -1 && PyErr_Occurred())
15955 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015956 if (it->it_seq != NULL) {
15957 if (index < 0)
15958 index = 0;
15959 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15960 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15961 it->it_index = index;
15962 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015963 Py_RETURN_NONE;
15964}
15965
15966PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15967
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015968static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015969 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015970 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015971 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15972 reduce_doc},
15973 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15974 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015975 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015976};
15977
15978PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015979 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15980 "str_iterator", /* tp_name */
15981 sizeof(unicodeiterobject), /* tp_basicsize */
15982 0, /* tp_itemsize */
15983 /* methods */
15984 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015985 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015986 0, /* tp_getattr */
15987 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015988 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015989 0, /* tp_repr */
15990 0, /* tp_as_number */
15991 0, /* tp_as_sequence */
15992 0, /* tp_as_mapping */
15993 0, /* tp_hash */
15994 0, /* tp_call */
15995 0, /* tp_str */
15996 PyObject_GenericGetAttr, /* tp_getattro */
15997 0, /* tp_setattro */
15998 0, /* tp_as_buffer */
15999 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
16000 0, /* tp_doc */
16001 (traverseproc)unicodeiter_traverse, /* tp_traverse */
16002 0, /* tp_clear */
16003 0, /* tp_richcompare */
16004 0, /* tp_weaklistoffset */
16005 PyObject_SelfIter, /* tp_iter */
16006 (iternextfunc)unicodeiter_next, /* tp_iternext */
16007 unicodeiter_methods, /* tp_methods */
16008 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016009};
16010
16011static PyObject *
16012unicode_iter(PyObject *seq)
16013{
Benjamin Peterson14339b62009-01-31 16:36:08 +000016014 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016015
Benjamin Peterson14339b62009-01-31 16:36:08 +000016016 if (!PyUnicode_Check(seq)) {
16017 PyErr_BadInternalCall();
16018 return NULL;
16019 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020016020 if (PyUnicode_READY(seq) == -1)
16021 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016022 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
16023 if (it == NULL)
16024 return NULL;
16025 it->it_index = 0;
16026 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020016027 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016028 _PyObject_GC_TRACK(it);
16029 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016030}
16031
Victor Stinner709d23d2019-05-02 14:56:30 -040016032static int
16033encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016034{
Victor Stinner709d23d2019-05-02 14:56:30 -040016035 int res;
16036 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16037 if (res == -2) {
16038 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16039 return -1;
16040 }
16041 if (res < 0) {
16042 PyErr_NoMemory();
16043 return -1;
16044 }
16045 return 0;
16046}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016047
Victor Stinner709d23d2019-05-02 14:56:30 -040016048
16049static int
16050config_get_codec_name(wchar_t **config_encoding)
16051{
16052 char *encoding;
16053 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16054 return -1;
16055 }
16056
16057 PyObject *name_obj = NULL;
16058 PyObject *codec = _PyCodec_Lookup(encoding);
16059 PyMem_RawFree(encoding);
16060
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016061 if (!codec)
16062 goto error;
16063
16064 name_obj = PyObject_GetAttrString(codec, "name");
16065 Py_CLEAR(codec);
16066 if (!name_obj) {
16067 goto error;
16068 }
16069
Victor Stinner709d23d2019-05-02 14:56:30 -040016070 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16071 Py_DECREF(name_obj);
16072 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016073 goto error;
16074 }
16075
Victor Stinner709d23d2019-05-02 14:56:30 -040016076 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16077 if (raw_wname == NULL) {
16078 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016079 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016080 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016081 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016082
16083 PyMem_RawFree(*config_encoding);
16084 *config_encoding = raw_wname;
16085
16086 PyMem_Free(wname);
16087 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016088
16089error:
16090 Py_XDECREF(codec);
16091 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016092 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016093}
16094
16095
Victor Stinner331a6a52019-05-27 16:39:22 +020016096static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016097init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016098{
Victor Stinner709d23d2019-05-02 14:56:30 -040016099 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016100 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(tstate->interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016101 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016102 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016103 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016104 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016105 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016106}
16107
16108
Victor Stinner709d23d2019-05-02 14:56:30 -040016109static int
16110init_fs_codec(PyInterpreterState *interp)
16111{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016112 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016113
16114 _Py_error_handler error_handler;
16115 error_handler = get_error_handler_wide(config->filesystem_errors);
16116 if (error_handler == _Py_ERROR_UNKNOWN) {
16117 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
16118 return -1;
16119 }
16120
16121 char *encoding, *errors;
16122 if (encode_wstr_utf8(config->filesystem_encoding,
16123 &encoding,
16124 "filesystem_encoding") < 0) {
16125 return -1;
16126 }
16127
16128 if (encode_wstr_utf8(config->filesystem_errors,
16129 &errors,
16130 "filesystem_errors") < 0) {
16131 PyMem_RawFree(encoding);
16132 return -1;
16133 }
16134
Victor Stinner3d17c042020-05-14 01:48:38 +020016135 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16136 PyMem_RawFree(fs_codec->encoding);
16137 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016138 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016139 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16140 PyMem_RawFree(fs_codec->errors);
16141 fs_codec->errors = errors;
16142 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016143
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016144#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016145 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016146#endif
16147
Victor Stinner709d23d2019-05-02 14:56:30 -040016148 /* At this point, PyUnicode_EncodeFSDefault() and
16149 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16150 the C implementation of the filesystem encoding. */
16151
16152 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16153 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016154 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16155 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016156 PyErr_NoMemory();
16157 return -1;
16158 }
16159 return 0;
16160}
16161
16162
Victor Stinner331a6a52019-05-27 16:39:22 +020016163static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016164init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016165{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016166 PyInterpreterState *interp = tstate->interp;
16167
Victor Stinner709d23d2019-05-02 14:56:30 -040016168 /* Update the filesystem encoding to the normalized Python codec name.
16169 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16170 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016171 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016172 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016173 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016174 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016175 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016176 }
16177
Victor Stinner709d23d2019-05-02 14:56:30 -040016178 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016179 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016180 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016181 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016182}
16183
16184
Victor Stinner331a6a52019-05-27 16:39:22 +020016185PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016186_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016187{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016188 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016189 if (_PyStatus_EXCEPTION(status)) {
16190 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016191 }
16192
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016193 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016194}
16195
16196
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016197static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016198_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016199{
Victor Stinner3d17c042020-05-14 01:48:38 +020016200 PyMem_RawFree(fs_codec->encoding);
16201 fs_codec->encoding = NULL;
16202 fs_codec->utf8 = 0;
16203 PyMem_RawFree(fs_codec->errors);
16204 fs_codec->errors = NULL;
16205 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016206}
16207
16208
#ifdef MS_WINDOWS
/* Switch the current interpreter to the legacy Windows filesystem
   encoding: "mbcs" with the "replace" error handler (PEP 529), then
   re-initialize the filesystem codec.

   Returns 0 on success.  On failure, sets an exception and returns -1;
   the configuration is left unchanged when the new strings cannot be
   allocated. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed: release whichever succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16234
16235
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016236void
Victor Stinner3d483342019-11-22 12:27:50 +010016237_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016238{
Victor Stinner666ecfb2020-07-02 01:19:57 +020016239 struct _Py_unicode_state *state = &tstate->interp->unicode;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016240
Victor Stinnerea251802020-12-26 02:58:33 +010016241 // _PyUnicode_ClearInterned() must be called before
16242 assert(state->interned == NULL);
16243
16244 _PyUnicode_FiniEncodings(&state->fs_codec);
16245
16246 unicode_clear_identifiers(tstate);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016247
Victor Stinner2f9ada92020-06-24 02:22:21 +020016248 for (Py_ssize_t i = 0; i < 256; i++) {
16249 Py_CLEAR(state->latin1[i]);
16250 }
Victor Stinnerea251802020-12-26 02:58:33 +010016251 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016252}
16253
16254
Georg Brandl66c221e2010-10-14 07:04:07 +000016255/* A _string module, to export formatter_parser and formatter_field_name_split
16256 to the string.Formatter class implemented in Python. */
16257
16258static PyMethodDef _string_methods[] = {
16259 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16260 METH_O, PyDoc_STR("split the argument as a field name")},
16261 {"formatter_parser", (PyCFunction) formatter_parser,
16262 METH_O, PyDoc_STR("parse the argument as a format string")},
16263 {NULL, NULL}
16264};
16265
16266static struct PyModuleDef _string_module = {
16267 PyModuleDef_HEAD_INIT,
Victor Stinnerbb083d32020-09-08 15:33:08 +020016268 .m_name = "_string",
16269 .m_doc = PyDoc_STR("string helper module"),
16270 .m_size = 0,
16271 .m_methods = _string_methods,
Georg Brandl66c221e2010-10-14 07:04:07 +000016272};
16273
16274PyMODINIT_FUNC
16275PyInit__string(void)
16276{
Victor Stinnerbb083d32020-09-08 15:33:08 +020016277 return PyModuleDef_Init(&_string_module);
Georg Brandl66c221e2010-10-14 07:04:07 +000016278}
16279
16280
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016281#ifdef __cplusplus
16282}
16283#endif