blob: c72871074b3ebd190a12fa7e6f74fd32db462256 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner47e1afd2020-10-26 16:43:47 +010043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinnerba3d67c2020-12-26 00:41:46 +010044#include "pycore_atomic_funcs.h" // _Py_atomic_size_get()
Victor Stinner47e1afd2020-10-26 16:43:47 +010045#include "pycore_bytes_methods.h" // _Py_bytes_lower()
Serhiy Storchaka2ad93822020-12-03 12:46:16 +020046#include "pycore_format.h" // F_LJUST
Victor Stinner47e1afd2020-10-26 16:43:47 +010047#include "pycore_initconfig.h" // _PyStatus_OK()
48#include "pycore_interp.h" // PyInterpreterState.fs_codec
49#include "pycore_object.h" // _PyObject_GC_TRACK()
50#include "pycore_pathconfig.h" // _Py_DumpPathConfig()
51#include "pycore_pylifecycle.h" // _Py_SetFileSystemEncoding()
52#include "pycore_pystate.h" // _PyInterpreterState_GET()
53#include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
54#include "stringlib/eq.h" // unicode_eq()
Guido van Rossumd57fd912000-03-10 22:53:23 +000055
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000056#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000057#include <windows.h>
58#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059
Jakub Kulík9032cf52021-04-30 15:21:42 +020060#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
61#include "pycore_fileutils.h" // _Py_LocaleUsesNonUnicodeWchar()
62#endif
63
Victor Stinner666ecfb2020-07-02 01:19:57 +020064/* Uncomment to display statistics on interned strings at exit
65 in _PyUnicode_ClearInterned(). */
Victor Stinnerfecc4f22019-03-19 14:20:29 +010066/* #define INTERNED_STATS 1 */
67
68
Larry Hastings61272b72014-01-07 12:41:53 -080069/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090070class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080071[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090072/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
73
74/*[python input]
75class Py_UCS4_converter(CConverter):
76 type = 'Py_UCS4'
77 converter = 'convert_uc'
78
79 def converter_init(self):
80 if self.default is not unspecified:
81 self.c_default = ascii(self.default)
82 if len(self.c_default) > 4 or self.c_default[0] != "'":
83 self.c_default = hex(ord(self.default))
84
85[python start generated code]*/
86/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080087
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000095
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000096
97#ifdef __cplusplus
98extern "C" {
99#endif
100
/* Largest valid code point (Unicode 6.0): U+10FFFF, i.e. 1,114,111.
   fileutils.c carries its own copy of this value; keep the two in sync. */
#define MAX_UNICODE 0x10ffff
104
/* Sanity check used inside assert() by the accessor macros in this file.
   Release builds only test the type; debug builds additionally run
   _PyUnicode_CheckConsistency() without scanning the content. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200110
/* Raw access to the utf8 member of a PyCompactUnicodeObject (no checks). */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)

/* Checked UTF-8 pointer: for a compact ASCII string this is the address
   right after the PyASCIIObject header, otherwise the utf8 member. */
#define PyUnicode_UTF8(op)                                  \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((char*)((PyASCIIObject*)(op) + 1))              \
         : _PyUnicode_UTF8(op))

/* Raw access to the utf8_length member (no checks). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* Checked UTF-8 length: for a compact ASCII string this is the length
   member of the PyASCIIObject header, otherwise utf8_length. */
#define PyUnicode_UTF8_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(PyUnicode_IS_READY(op)),                        \
     PyUnicode_IS_COMPACT_ASCII(op)                         \
         ? ((PyASCIIObject*)(op))->length                   \
         : _PyUnicode_UTF8_LENGTH(op))

/* Raw access to the wstr member of the PyASCIIObject header (no checks). */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
Inada Naoki2c4928d2020-06-17 20:09:44 +0900129
/* Don't use deprecated macro of unicodeobject.h: replace it with a local
   version.  For a compact ASCII string the result is the length member of
   the PyASCIIObject header, otherwise the wstr_length member of the
   PyCompactUnicodeObject.

   The parameter is parenthesized inside each cast so that an argument
   which is an expression (not just an identifier) is cast as a whole. */
#undef PyUnicode_WSTR_LENGTH
#define PyUnicode_WSTR_LENGTH(op)                           \
    (PyUnicode_IS_COMPACT_ASCII(op) ?                       \
     ((PyASCIIObject*)(op))->length :                       \
     ((PyCompactUnicodeObject*)(op))->wstr_length)
/* Unchecked field accessors.  Each expands to the struct member itself, so
   the expansion can also be used as an assignment target. */
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)   (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)     (((PyASCIIObject *)(op))->hash)

/* Same idea, but assert _PyUnicode_CHECK(op) before touching the field. */
#define _PyUnicode_KIND(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     ((PyASCIIObject *)(op))->length)

/* data.any member of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200152
/* Local replacement for PyUnicode_READY(): asserts _PyUnicode_CHECK(op),
   then evaluates to 0 when the string is already ready and to the result
   of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                 \
    (assert(_PyUnicode_CHECK(op)),                          \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200159
/* True if the utf8 pointer of a non-compact-ASCII string is the same
   address as its character data. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer is the same address as the character data.
   Uses the macro parameter `op` throughout; it previously referenced an
   identifier named `unicode`, which only worked by accident when the
   caller happened to have a variable of that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
167
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not the character data itself (i.e. not shared). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr block:
   the wstr pointer is set and either the string is not ready yet or the
   pointer is not the character data itself (i.e. not shared). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op)                               \
      && (!PyUnicode_IS_READY(op)                       \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
181
/* Generic helper macro copying characters from one element type to another.

   from_type / to_type : valid type names of the source / destination items.
   begin, end          : pointers delimiting the source items; they are
                         cast to "const from_type *".
   to                  : pointer to the destination buffer; it is cast to
                         "to_type *".

   Each source item is converted with a plain (to_type) cast.  The bulk of
   the range is copied four items per iteration; a second loop copies the
   remaining (fewer than four) items. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin);\
        const from_type *_end = (const from_type *)(end);\
        Py_ssize_t _len = (_end) - (_iter);             \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_len, 4);       \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        for (; _iter < (_end); _iter++, _to++) {        \
            *_to = (to_type) *_iter;                    \
        }                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200205
/* Platform-tuned over-allocation factor.
   NOTE(review): the percentages below suggest the extra space is
   size / OVERALLOCATE_FACTOR -- confirm at the use site. */
#ifndef MS_WINDOWS
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
213
Walter Dörwald16807132007-05-25 13:52:07 +0000214
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200215static struct _Py_unicode_state*
216get_unicode_state(void)
217{
218 PyInterpreterState *interp = _PyInterpreterState_GET();
219 return &interp->unicode;
220}
Serhiy Storchaka05997252013-01-26 12:14:02 +0200221
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000222
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200223// Return a borrowed reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200224static inline PyObject* unicode_get_empty(void)
225{
226 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner90ed8a62020-06-24 00:34:07 +0200227 // unicode_get_empty() must not be called before _PyUnicode_Init()
228 // or after _PyUnicode_Fini()
Victor Stinner91698d82020-06-25 14:07:40 +0200229 assert(state->empty_string != NULL);
230 return state->empty_string;
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200231}
232
Victor Stinner91698d82020-06-25 14:07:40 +0200233
234// Return a strong reference to the empty string singleton.
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200235static inline PyObject* unicode_new_empty(void)
236{
Victor Stinner90ed8a62020-06-24 00:34:07 +0200237 PyObject *empty = unicode_get_empty();
238 Py_INCREF(empty);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200239 return empty;
240}
241
/* Statement macro: return a strong reference to the empty string singleton
   from the enclosing function. */
#define _Py_RETURN_UNICODE_EMPTY() \
    do { return unicode_new_empty(); } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000246
Victor Stinner59423e32018-11-26 13:40:01 +0100247static inline void
248unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
249 Py_ssize_t start, Py_ssize_t length)
250{
251 assert(0 <= start);
252 assert(kind != PyUnicode_WCHAR_KIND);
253 switch (kind) {
254 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100255 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100256 Py_UCS1 ch = (unsigned char)value;
257 Py_UCS1 *to = (Py_UCS1 *)data + start;
258 memset(to, ch, length);
259 break;
260 }
261 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100262 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100263 Py_UCS2 ch = (Py_UCS2)value;
264 Py_UCS2 *to = (Py_UCS2 *)data + start;
265 const Py_UCS2 *end = to + length;
266 for (; to < end; ++to) *to = ch;
267 break;
268 }
269 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100270 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100271 Py_UCS4 ch = value;
272 Py_UCS4 * to = (Py_UCS4 *)data + start;
273 const Py_UCS4 *end = to + length;
274 for (; to < end; ++to) *to = ch;
275 break;
276 }
277 default: Py_UNREACHABLE();
278 }
279}
280
281
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200282/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700283static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200284_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900285static inline void
286_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400287static PyObject *
288unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
289 const char *errors);
290static PyObject *
291unicode_decode_utf8(const char *s, Py_ssize_t size,
292 _Py_error_handler error_handler, const char *errors,
293 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200294
/* Fast detection of the most frequent whitespace characters.

   128-entry lookup table indexed by ASCII code: an entry is 1 for the
   characters listed below and 0 for every other index. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
325
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200326/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200327static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200328static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100329static int unicode_modifiable(PyObject *unicode);
330
Victor Stinnerfe226c02011-10-03 03:52:20 +0200331
Alexander Belopolsky40018472011-02-26 01:02:56 +0000332static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100333_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200334static PyObject *
335_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
336static PyObject *
337_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
338
339static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000340unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000341 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100342 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000343 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
344
Alexander Belopolsky40018472011-02-26 01:02:56 +0000345static void
346raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300347 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100348 PyObject *unicode,
349 Py_ssize_t startpos, Py_ssize_t endpos,
350 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000351
/* Fast detection of line break characters.

   128-entry lookup table indexed by ASCII code: an entry is 1 for the
   characters listed below and 0 for every other index. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
379
INADA Naoki3ae20562017-01-16 20:41:20 +0900380static int convert_uc(PyObject *obj, void *addr);
381
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300382#include "clinic/unicodeobject.c.h"
383
Victor Stinner3d4226a2018-08-29 22:21:32 +0200384_Py_error_handler
385_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200386{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200387 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200388 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200391 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200394 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
396 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200397 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200398 }
399 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200400 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200401 }
402 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200403 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200404 }
405 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200406 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200407 }
Victor Stinner50149202015-09-22 00:26:54 +0200408 return _Py_ERROR_OTHER;
409}
410
Victor Stinner709d23d2019-05-02 14:56:30 -0400411
412static _Py_error_handler
413get_error_handler_wide(const wchar_t *errors)
414{
415 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
416 return _Py_ERROR_STRICT;
417 }
418 if (wcscmp(errors, L"surrogateescape") == 0) {
419 return _Py_ERROR_SURROGATEESCAPE;
420 }
421 if (wcscmp(errors, L"replace") == 0) {
422 return _Py_ERROR_REPLACE;
423 }
424 if (wcscmp(errors, L"ignore") == 0) {
425 return _Py_ERROR_IGNORE;
426 }
427 if (wcscmp(errors, L"backslashreplace") == 0) {
428 return _Py_ERROR_BACKSLASHREPLACE;
429 }
430 if (wcscmp(errors, L"surrogatepass") == 0) {
431 return _Py_ERROR_SURROGATEPASS;
432 }
433 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
434 return _Py_ERROR_XMLCHARREFREPLACE;
435 }
436 return _Py_ERROR_OTHER;
437}
438
439
Victor Stinner22eb6892019-06-26 00:51:05 +0200440static inline int
441unicode_check_encoding_errors(const char *encoding, const char *errors)
442{
443 if (encoding == NULL && errors == NULL) {
444 return 0;
445 }
446
Victor Stinner81a7be32020-04-14 15:14:01 +0200447 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner22eb6892019-06-26 00:51:05 +0200448#ifndef Py_DEBUG
449 /* In release mode, only check in development mode (-X dev) */
Victor Stinnerda7933e2020-04-13 03:04:28 +0200450 if (!_PyInterpreterState_GetConfig(interp)->dev_mode) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200451 return 0;
452 }
453#else
454 /* Always check in debug mode */
455#endif
456
457 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
458 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
Victor Stinner3d17c042020-05-14 01:48:38 +0200459 if (!interp->unicode.fs_codec.encoding) {
Victor Stinner22eb6892019-06-26 00:51:05 +0200460 return 0;
461 }
462
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200463 /* Disable checks during Python finalization. For example, it allows to
464 call _PyObject_Dump() during finalization for debugging purpose. */
465 if (interp->finalizing) {
466 return 0;
467 }
468
Victor Stinner22eb6892019-06-26 00:51:05 +0200469 if (encoding != NULL) {
470 PyObject *handler = _PyCodec_Lookup(encoding);
471 if (handler == NULL) {
472 return -1;
473 }
474 Py_DECREF(handler);
475 }
476
477 if (errors != NULL) {
478 PyObject *handler = PyCodec_LookupError(errors);
479 if (handler == NULL) {
480 return -1;
481 }
482 Py_DECREF(handler);
483 }
484 return 0;
485}
486
487
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200488int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100489_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200490{
Victor Stinner68762572019-10-07 18:42:01 +0200491#define CHECK(expr) \
492 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
493
Victor Stinner910337b2011-10-03 03:20:16 +0200494 PyASCIIObject *ascii;
495 unsigned int kind;
496
Victor Stinner68762572019-10-07 18:42:01 +0200497 assert(op != NULL);
498 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200499
500 ascii = (PyASCIIObject *)op;
501 kind = ascii->state.kind;
502
Victor Stinnera3b334d2011-10-03 13:53:37 +0200503 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200504 CHECK(kind == PyUnicode_1BYTE_KIND);
505 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200506 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200507 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200508 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200509 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200510
Victor Stinnera41463c2011-10-04 01:05:08 +0200511 if (ascii->state.compact == 1) {
512 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200513 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200514 || kind == PyUnicode_2BYTE_KIND
515 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200516 CHECK(ascii->state.ascii == 0);
517 CHECK(ascii->state.ready == 1);
518 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100519 }
520 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200521 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
522
523 data = unicode->data.any;
524 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200525 CHECK(ascii->length == 0);
526 CHECK(ascii->hash == -1);
527 CHECK(ascii->state.compact == 0);
528 CHECK(ascii->state.ascii == 0);
529 CHECK(ascii->state.ready == 0);
530 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
531 CHECK(ascii->wstr != NULL);
532 CHECK(data == NULL);
533 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200534 }
535 else {
Victor Stinner68762572019-10-07 18:42:01 +0200536 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200537 || kind == PyUnicode_2BYTE_KIND
538 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200539 CHECK(ascii->state.compact == 0);
540 CHECK(ascii->state.ready == 1);
541 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200542 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200543 CHECK(compact->utf8 == data);
544 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200545 }
546 else
Victor Stinner68762572019-10-07 18:42:01 +0200547 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200548 }
549 }
550 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200551 if (
552#if SIZEOF_WCHAR_T == 2
553 kind == PyUnicode_2BYTE_KIND
554#else
555 kind == PyUnicode_4BYTE_KIND
556#endif
557 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200558 {
Victor Stinner68762572019-10-07 18:42:01 +0200559 CHECK(ascii->wstr == data);
560 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200561 } else
Victor Stinner68762572019-10-07 18:42:01 +0200562 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200563 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200564
565 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200566 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200567 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200568 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200569 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200570
571 /* check that the best kind is used: O(n) operation */
572 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200573 Py_ssize_t i;
574 Py_UCS4 maxchar = 0;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300575 const void *data;
Victor Stinner718fbf02012-04-26 00:39:37 +0200576 Py_UCS4 ch;
577
578 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200579 for (i=0; i < ascii->length; i++)
580 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200581 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200582 if (ch > maxchar)
583 maxchar = ch;
584 }
585 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100586 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200587 CHECK(maxchar >= 128);
588 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100589 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200590 else
Victor Stinner68762572019-10-07 18:42:01 +0200591 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200592 }
Victor Stinner77faf692011-11-20 18:56:05 +0100593 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200594 CHECK(maxchar >= 0x100);
595 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100596 }
597 else {
Victor Stinner68762572019-10-07 18:42:01 +0200598 CHECK(maxchar >= 0x10000);
599 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100600 }
Victor Stinner68762572019-10-07 18:42:01 +0200601 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200602 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400603 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200604
605#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400606}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200607
Victor Stinner910337b2011-10-03 03:20:16 +0200608
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100609static PyObject*
610unicode_result_wchar(PyObject *unicode)
611{
612#ifndef Py_DEBUG
613 Py_ssize_t len;
614
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100615 len = _PyUnicode_WSTR_LENGTH(unicode);
616 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100617 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200618 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100619 }
620
621 if (len == 1) {
622 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100623 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100624 Py_DECREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200625 return get_latin1_char((unsigned char)ch);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100626 }
627 }
628
629 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200630 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 return NULL;
632 }
633#else
Victor Stinneraa771272012-10-04 02:32:58 +0200634 assert(Py_REFCNT(unicode) == 1);
635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 /* don't make the result ready in debug mode to ensure that the caller
637 makes the string ready before using it */
638 assert(_PyUnicode_CheckConsistency(unicode, 1));
639#endif
640 return unicode;
641}
642
643static PyObject*
644unicode_result_ready(PyObject *unicode)
645{
646 Py_ssize_t length;
647
648 length = PyUnicode_GET_LENGTH(unicode);
649 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200650 PyObject *empty = unicode_get_empty();
651 if (unicode != empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100652 Py_DECREF(unicode);
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200653 Py_INCREF(empty);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100654 }
Victor Stinner90ed8a62020-06-24 00:34:07 +0200655 return empty;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100656 }
657
658 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200659 int kind = PyUnicode_KIND(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200660 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakac43317d2021-06-12 20:44:32 +0300661 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200662 Py_UCS1 ch = data[0];
663 struct _Py_unicode_state *state = get_unicode_state();
664 PyObject *latin1_char = state->latin1[ch];
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100665 if (latin1_char != NULL) {
666 if (unicode != latin1_char) {
667 Py_INCREF(latin1_char);
668 Py_DECREF(unicode);
669 }
670 return latin1_char;
671 }
672 else {
673 assert(_PyUnicode_CheckConsistency(unicode, 1));
674 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +0200675 state->latin1[ch] = unicode;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100676 return unicode;
677 }
678 }
Victor Stinner2f9ada92020-06-24 02:22:21 +0200679 else {
680 assert(PyUnicode_READ_CHAR(unicode, 0) >= 256);
681 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100682 }
683
684 assert(_PyUnicode_CheckConsistency(unicode, 1));
685 return unicode;
686}
687
688static PyObject*
689unicode_result(PyObject *unicode)
690{
691 assert(_PyUnicode_CHECK(unicode));
692 if (PyUnicode_IS_READY(unicode))
693 return unicode_result_ready(unicode);
694 else
695 return unicode_result_wchar(unicode);
696}
697
Victor Stinnerc4b49542011-12-11 22:44:26 +0100698static PyObject*
699unicode_result_unchanged(PyObject *unicode)
700{
701 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500702 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100703 return NULL;
704 Py_INCREF(unicode);
705 return unicode;
706 }
707 else
708 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100709 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100710}
711
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200712/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
713 ASCII, Latin1, UTF-8, etc. */
714static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200715backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200716 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
717{
Victor Stinnerad771582015-10-09 12:38:53 +0200718 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200719 Py_UCS4 ch;
720 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300721 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200722
723 assert(PyUnicode_IS_READY(unicode));
724 kind = PyUnicode_KIND(unicode);
725 data = PyUnicode_DATA(unicode);
726
727 size = 0;
728 /* determine replacement size */
729 for (i = collstart; i < collend; ++i) {
730 Py_ssize_t incr;
731
732 ch = PyUnicode_READ(kind, data, i);
733 if (ch < 0x100)
734 incr = 2+2;
735 else if (ch < 0x10000)
736 incr = 2+4;
737 else {
738 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200739 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200740 }
741 if (size > PY_SSIZE_T_MAX - incr) {
742 PyErr_SetString(PyExc_OverflowError,
743 "encoded result is too long for a Python string");
744 return NULL;
745 }
746 size += incr;
747 }
748
Victor Stinnerad771582015-10-09 12:38:53 +0200749 str = _PyBytesWriter_Prepare(writer, str, size);
750 if (str == NULL)
751 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200752
753 /* generate replacement */
754 for (i = collstart; i < collend; ++i) {
755 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200756 *str++ = '\\';
757 if (ch >= 0x00010000) {
758 *str++ = 'U';
759 *str++ = Py_hexdigits[(ch>>28)&0xf];
760 *str++ = Py_hexdigits[(ch>>24)&0xf];
761 *str++ = Py_hexdigits[(ch>>20)&0xf];
762 *str++ = Py_hexdigits[(ch>>16)&0xf];
763 *str++ = Py_hexdigits[(ch>>12)&0xf];
764 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200765 }
Victor Stinner797485e2015-10-09 03:17:30 +0200766 else if (ch >= 0x100) {
767 *str++ = 'u';
768 *str++ = Py_hexdigits[(ch>>12)&0xf];
769 *str++ = Py_hexdigits[(ch>>8)&0xf];
770 }
771 else
772 *str++ = 'x';
773 *str++ = Py_hexdigits[(ch>>4)&0xf];
774 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200775 }
776 return str;
777}
778
779/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
780 ASCII, Latin1, UTF-8, etc. */
781static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200782xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200783 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
784{
Victor Stinnerad771582015-10-09 12:38:53 +0200785 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200786 Py_UCS4 ch;
787 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300788 const void *data;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200789
790 assert(PyUnicode_IS_READY(unicode));
791 kind = PyUnicode_KIND(unicode);
792 data = PyUnicode_DATA(unicode);
793
794 size = 0;
795 /* determine replacement size */
796 for (i = collstart; i < collend; ++i) {
797 Py_ssize_t incr;
798
799 ch = PyUnicode_READ(kind, data, i);
800 if (ch < 10)
801 incr = 2+1+1;
802 else if (ch < 100)
803 incr = 2+2+1;
804 else if (ch < 1000)
805 incr = 2+3+1;
806 else if (ch < 10000)
807 incr = 2+4+1;
808 else if (ch < 100000)
809 incr = 2+5+1;
810 else if (ch < 1000000)
811 incr = 2+6+1;
812 else {
813 assert(ch <= MAX_UNICODE);
814 incr = 2+7+1;
815 }
816 if (size > PY_SSIZE_T_MAX - incr) {
817 PyErr_SetString(PyExc_OverflowError,
818 "encoded result is too long for a Python string");
819 return NULL;
820 }
821 size += incr;
822 }
823
Victor Stinnerad771582015-10-09 12:38:53 +0200824 str = _PyBytesWriter_Prepare(writer, str, size);
825 if (str == NULL)
826 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200827
828 /* generate replacement */
829 for (i = collstart; i < collend; ++i) {
Christian Heimes07f2ade2020-11-18 16:38:53 +0100830 size = sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
831 if (size < 0) {
832 return NULL;
833 }
834 str += size;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200835 }
836 return str;
837}
838
Thomas Wouters477c8d52006-05-27 19:21:47 +0000839/* --- Bloom Filters ----------------------------------------------------- */
840
841/* stuff to implement simple "bloom filters" for Unicode characters.
842 to keep things simple, we use a single bitmask, using the least 5
843 bits from each unicode characters as the bit index. */
844
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200845/* the linebreak mask is set up by _PyUnicode_Init() below */
Thomas Wouters477c8d52006-05-27 19:21:47 +0000846
Antoine Pitrouf068f942010-01-13 14:19:12 +0000847#if LONG_BIT >= 128
848#define BLOOM_WIDTH 128
849#elif LONG_BIT >= 64
850#define BLOOM_WIDTH 64
851#elif LONG_BIT >= 32
852#define BLOOM_WIDTH 32
853#else
854#error "LONG_BIT is smaller than 32"
855#endif
856
Thomas Wouters477c8d52006-05-27 19:21:47 +0000857#define BLOOM_MASK unsigned long
858
Serhiy Storchaka05997252013-01-26 12:14:02 +0200859static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000860
Antoine Pitrouf068f942010-01-13 14:19:12 +0000861#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000862
Benjamin Peterson29060642009-01-31 22:14:21 +0000863#define BLOOM_LINEBREAK(ch) \
864 ((ch) < 128U ? ascii_linebreak[(ch)] : \
865 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000866
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700867static inline BLOOM_MASK
Serhiy Storchakacd8295f2020-04-11 10:48:40 +0300868make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000869{
Victor Stinnera85af502013-04-09 21:53:54 +0200870#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
871 do { \
872 TYPE *data = (TYPE *)PTR; \
873 TYPE *end = data + LEN; \
874 Py_UCS4 ch; \
875 for (; data != end; data++) { \
876 ch = *data; \
877 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
878 } \
879 break; \
880 } while (0)
881
Thomas Wouters477c8d52006-05-27 19:21:47 +0000882 /* calculate simple bloom-style bitmask for a given unicode string */
883
Antoine Pitrouf068f942010-01-13 14:19:12 +0000884 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000885
886 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200887 switch (kind) {
888 case PyUnicode_1BYTE_KIND:
889 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
890 break;
891 case PyUnicode_2BYTE_KIND:
892 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
893 break;
894 case PyUnicode_4BYTE_KIND:
895 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
896 break;
897 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700898 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200899 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000900 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200901
902#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000903}
904
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300905static int
906ensure_unicode(PyObject *obj)
907{
908 if (!PyUnicode_Check(obj)) {
909 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200910 "must be str, not %.100s",
911 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300912 return -1;
913 }
914 return PyUnicode_READY(obj);
915}
916
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200917/* Compilation of templated routines */
918
Victor Stinner90ed8a62020-06-24 00:34:07 +0200919#define STRINGLIB_GET_EMPTY() unicode_get_empty()
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200920
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200921#include "stringlib/asciilib.h"
922#include "stringlib/fastsearch.h"
923#include "stringlib/partition.h"
924#include "stringlib/split.h"
925#include "stringlib/count.h"
926#include "stringlib/find.h"
927#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200928#include "stringlib/undef.h"
929
930#include "stringlib/ucs1lib.h"
931#include "stringlib/fastsearch.h"
932#include "stringlib/partition.h"
933#include "stringlib/split.h"
934#include "stringlib/count.h"
935#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300936#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200937#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200938#include "stringlib/undef.h"
939
940#include "stringlib/ucs2lib.h"
941#include "stringlib/fastsearch.h"
942#include "stringlib/partition.h"
943#include "stringlib/split.h"
944#include "stringlib/count.h"
945#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300946#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200947#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200948#include "stringlib/undef.h"
949
950#include "stringlib/ucs4lib.h"
951#include "stringlib/fastsearch.h"
952#include "stringlib/partition.h"
953#include "stringlib/split.h"
954#include "stringlib/count.h"
955#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300956#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200957#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200958#include "stringlib/undef.h"
959
Inada Naoki2c4928d2020-06-17 20:09:44 +0900960_Py_COMP_DIAG_PUSH
961_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200962#include "stringlib/unicodedefs.h"
963#include "stringlib/fastsearch.h"
964#include "stringlib/count.h"
965#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100966#include "stringlib/undef.h"
Inada Naoki2c4928d2020-06-17 20:09:44 +0900967_Py_COMP_DIAG_POP
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200968
Victor Stinnerf363d0a2020-06-24 00:10:40 +0200969#undef STRINGLIB_GET_EMPTY
970
Guido van Rossumd57fd912000-03-10 22:53:23 +0000971/* --- Unicode Object ----------------------------------------------------- */
972
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700973static inline Py_ssize_t
974findchar(const void *s, int kind,
975 Py_ssize_t size, Py_UCS4 ch,
976 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200977{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200978 switch (kind) {
979 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200980 if ((Py_UCS1) ch != ch)
981 return -1;
982 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600983 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200984 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600985 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200986 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200987 if ((Py_UCS2) ch != ch)
988 return -1;
989 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600990 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200991 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600992 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200993 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200994 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600995 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200996 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600997 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200998 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700999 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +02001000 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001001}
1002
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, current length) with
   0xFF bytes so that use of uninitialized string data is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings.  U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Does nothing when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* The kind value doubles as the element size in bytes in the offset
       arithmetic below. */
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1021
Victor Stinnerfe226c02011-10-03 03:52:20 +02001022static PyObject*
1023resize_compact(PyObject *unicode, Py_ssize_t length)
1024{
1025 Py_ssize_t char_size;
1026 Py_ssize_t struct_size;
1027 Py_ssize_t new_size;
1028 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001029 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001030#ifdef Py_DEBUG
1031 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1032#endif
1033
Victor Stinner79891572012-05-03 13:43:07 +02001034 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001035 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001036 assert(PyUnicode_IS_COMPACT(unicode));
1037
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001038 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001039 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001040 struct_size = sizeof(PyASCIIObject);
1041 else
1042 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001043 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001044
Victor Stinnerfe226c02011-10-03 03:52:20 +02001045 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1046 PyErr_NoMemory();
1047 return NULL;
1048 }
1049 new_size = (struct_size + (length + 1) * char_size);
1050
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001051 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001052 PyObject_Free(_PyUnicode_UTF8(unicode));
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001053 _PyUnicode_UTF8(unicode) = NULL;
1054 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1055 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001056#ifdef Py_REF_DEBUG
1057 _Py_RefTotal--;
1058#endif
1059#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001060 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001061#endif
Victor Stinner84def372011-12-11 20:04:56 +01001062
Victor Stinner32bd68c2020-12-01 10:37:39 +01001063 new_unicode = (PyObject *)PyObject_Realloc(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001064 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001065 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001066 PyErr_NoMemory();
1067 return NULL;
1068 }
Victor Stinner84def372011-12-11 20:04:56 +01001069 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001070 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001071
Victor Stinnerfe226c02011-10-03 03:52:20 +02001072 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001073 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001075 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001076 _PyUnicode_WSTR_LENGTH(unicode) = length;
1077 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001078 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001079 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001080 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001081 if (!PyUnicode_IS_ASCII(unicode))
1082 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001083 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001084#ifdef Py_DEBUG
1085 unicode_fill_invalid(unicode, old_length);
1086#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001087 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1088 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001089 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001090 return unicode;
1091}
1092
Alexander Belopolsky40018472011-02-26 01:02:56 +00001093static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001094resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095{
Victor Stinner95663112011-10-04 01:03:50 +02001096 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001097 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001098 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001099 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001100
Victor Stinnerfe226c02011-10-03 03:52:20 +02001101 if (PyUnicode_IS_READY(unicode)) {
1102 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001103 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001104 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001105#ifdef Py_DEBUG
1106 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1107#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001108
1109 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001110 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001111 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1112 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001113
1114 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1115 PyErr_NoMemory();
1116 return -1;
1117 }
1118 new_size = (length + 1) * char_size;
1119
Victor Stinner7a9105a2011-12-12 00:13:42 +01001120 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1121 {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001122 PyObject_Free(_PyUnicode_UTF8(unicode));
Victor Stinner7a9105a2011-12-12 00:13:42 +01001123 _PyUnicode_UTF8(unicode) = NULL;
1124 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1125 }
1126
Victor Stinner32bd68c2020-12-01 10:37:39 +01001127 data = (PyObject *)PyObject_Realloc(data, new_size);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001128 if (data == NULL) {
1129 PyErr_NoMemory();
1130 return -1;
1131 }
1132 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001133 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001134 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001135 _PyUnicode_WSTR_LENGTH(unicode) = length;
1136 }
1137 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001138 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001139 _PyUnicode_UTF8_LENGTH(unicode) = length;
1140 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001141 _PyUnicode_LENGTH(unicode) = length;
1142 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001143#ifdef Py_DEBUG
1144 unicode_fill_invalid(unicode, old_length);
1145#endif
Victor Stinner95663112011-10-04 01:03:50 +02001146 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001147 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001148 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001149 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001150 }
Victor Stinner95663112011-10-04 01:03:50 +02001151 assert(_PyUnicode_WSTR(unicode) != NULL);
1152
1153 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001154 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001155 PyErr_NoMemory();
1156 return -1;
1157 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001158 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001159 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001160 wstr = PyObject_Realloc(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001161 if (!wstr) {
1162 PyErr_NoMemory();
1163 return -1;
1164 }
1165 _PyUnicode_WSTR(unicode) = wstr;
1166 _PyUnicode_WSTR(unicode)[length] = 0;
1167 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001168 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 return 0;
1170}
1171
Victor Stinnerfe226c02011-10-03 03:52:20 +02001172static PyObject*
1173resize_copy(PyObject *unicode, Py_ssize_t length)
1174{
1175 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001176 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001177 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001178
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001179 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180
1181 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1182 if (copy == NULL)
1183 return NULL;
1184
1185 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001186 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001187 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001188 }
1189 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001190 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001191
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001192 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001193 if (w == NULL)
1194 return NULL;
1195 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1196 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001197 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001198 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001199 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001200 }
1201}
1202
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001204 Ux0000 terminated; some code (e.g. new_identifier)
1205 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206
1207 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001208 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209
1210*/
1211
Alexander Belopolsky40018472011-02-26 01:02:56 +00001212static PyUnicodeObject *
1213_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001215 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001216 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
Thomas Wouters477c8d52006-05-27 19:21:47 +00001218 /* Optimization for empty strings */
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001219 if (length == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001220 return (PyUnicodeObject *)unicode_new_empty();
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221 }
1222
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001223 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001224 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001225 return (PyUnicodeObject *)PyErr_NoMemory();
1226 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001227 if (length < 0) {
1228 PyErr_SetString(PyExc_SystemError,
1229 "Negative size passed to _PyUnicode_New");
1230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 }
1232
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001233 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1234 if (unicode == NULL)
1235 return NULL;
1236 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001237
1238 _PyUnicode_WSTR_LENGTH(unicode) = length;
1239 _PyUnicode_HASH(unicode) = -1;
1240 _PyUnicode_STATE(unicode).interned = 0;
1241 _PyUnicode_STATE(unicode).kind = 0;
1242 _PyUnicode_STATE(unicode).compact = 0;
1243 _PyUnicode_STATE(unicode).ready = 0;
1244 _PyUnicode_STATE(unicode).ascii = 0;
1245 _PyUnicode_DATA_ANY(unicode) = NULL;
1246 _PyUnicode_LENGTH(unicode) = 0;
1247 _PyUnicode_UTF8(unicode) = NULL;
1248 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1249
Victor Stinner32bd68c2020-12-01 10:37:39 +01001250 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_Malloc(new_size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001251 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001252 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001253 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001254 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001255 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001256
Jeremy Hyltond8082792003-09-16 19:41:39 +00001257 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001258 * the caller fails before initializing str -- unicode_resize()
1259 * reads str[0], and the Keep-Alive optimization can keep memory
1260 * allocated for str alive across a call to unicode_dealloc(unicode).
1261 * We don't want unicode_resize to read uninitialized memory in
1262 * that case.
1263 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001264 _PyUnicode_WSTR(unicode)[0] = 0;
1265 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001266
Victor Stinner7931d9a2011-11-04 00:22:48 +01001267 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 return unicode;
1269}
1270
Victor Stinnerf42dc442011-10-02 23:33:16 +02001271static const char*
1272unicode_kind_name(PyObject *unicode)
1273{
Victor Stinner42dfd712011-10-03 14:41:45 +02001274 /* don't check consistency: unicode_kind_name() is called from
1275 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001276 if (!PyUnicode_IS_COMPACT(unicode))
1277 {
1278 if (!PyUnicode_IS_READY(unicode))
1279 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001280 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001281 {
1282 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001283 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 return "legacy ascii";
1285 else
1286 return "legacy latin1";
1287 case PyUnicode_2BYTE_KIND:
1288 return "legacy UCS2";
1289 case PyUnicode_4BYTE_KIND:
1290 return "legacy UCS4";
1291 default:
1292 return "<legacy invalid kind>";
1293 }
1294 }
1295 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001296 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001297 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001298 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001299 return "ascii";
1300 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001301 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001302 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001303 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001304 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001305 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001306 default:
1307 return "<invalid compact kind>";
1308 }
1309}
1310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001311#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312/* Functions wrapping macros for use in debugger */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001313const char *_PyUnicode_utf8(void *unicode_raw){
Victor Stinnera42de742018-11-22 10:25:22 +01001314 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001315 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001316}
1317
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001318const void *_PyUnicode_compact_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001319 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 return _PyUnicode_COMPACT_DATA(unicode);
1321}
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001322const void *_PyUnicode_data(void *unicode_raw) {
Victor Stinnera42de742018-11-22 10:25:22 +01001323 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001324 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001325 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1326 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1327 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1328 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1329 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1330 return PyUnicode_DATA(unicode);
1331}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001332
1333void
1334_PyUnicode_Dump(PyObject *op)
1335{
1336 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001337 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1338 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001339 const void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001340
Victor Stinnera849a4b2011-10-03 12:12:11 +02001341 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001342 {
1343 if (ascii->state.ascii)
1344 data = (ascii + 1);
1345 else
1346 data = (compact + 1);
1347 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001348 else
1349 data = unicode->data.any;
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001350 printf("%s: len=%zu, ", unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001351
Victor Stinnera849a4b2011-10-03 12:12:11 +02001352 if (ascii->wstr == data)
1353 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001354 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001355
Victor Stinnera3b334d2011-10-03 13:53:37 +02001356 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001357 printf(" (%zu), ", compact->wstr_length);
1358 if (!ascii->state.compact && compact->utf8 == unicode->data.any) {
Victor Stinnera849a4b2011-10-03 12:12:11 +02001359 printf("shared ");
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02001360 }
1361 printf("utf8=%p (%zu)", (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001362 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001363 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001364}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001365#endif
1366
Victor Stinner91698d82020-06-25 14:07:40 +02001367static int
1368unicode_create_empty_string_singleton(struct _Py_unicode_state *state)
1369{
1370 // Use size=1 rather than size=0, so PyUnicode_New(0, maxchar) can be
1371 // optimized to always use state->empty_string without having to check if
1372 // it is NULL or not.
1373 PyObject *empty = PyUnicode_New(1, 0);
1374 if (empty == NULL) {
1375 return -1;
1376 }
1377 PyUnicode_1BYTE_DATA(empty)[0] = 0;
1378 _PyUnicode_LENGTH(empty) = 0;
1379 assert(_PyUnicode_CheckConsistency(empty, 1));
1380
1381 assert(state->empty_string == NULL);
1382 state->empty_string = empty;
1383 return 0;
1384}
1385
1386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001387PyObject *
1388PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1389{
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001390 /* Optimization for empty strings */
1391 if (size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +02001392 return unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02001393 }
1394
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 PyObject *obj;
1396 PyCompactUnicodeObject *unicode;
1397 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001398 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001399 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001400 Py_ssize_t char_size;
1401 Py_ssize_t struct_size;
1402
Victor Stinner9e9d6892011-10-04 01:02:02 +02001403 is_ascii = 0;
1404 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001405 struct_size = sizeof(PyCompactUnicodeObject);
1406 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001407 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001408 char_size = 1;
1409 is_ascii = 1;
1410 struct_size = sizeof(PyASCIIObject);
1411 }
1412 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001413 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001414 char_size = 1;
1415 }
1416 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001417 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001418 char_size = 2;
1419 if (sizeof(wchar_t) == 2)
1420 is_sharing = 1;
1421 }
1422 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001423 if (maxchar > MAX_UNICODE) {
1424 PyErr_SetString(PyExc_SystemError,
1425 "invalid maximum character passed to PyUnicode_New");
1426 return NULL;
1427 }
Victor Stinner8f825062012-04-27 13:55:39 +02001428 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001429 char_size = 4;
1430 if (sizeof(wchar_t) == 4)
1431 is_sharing = 1;
1432 }
1433
1434 /* Ensure we won't overflow the size. */
1435 if (size < 0) {
1436 PyErr_SetString(PyExc_SystemError,
1437 "Negative size passed to PyUnicode_New");
1438 return NULL;
1439 }
1440 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1441 return PyErr_NoMemory();
1442
1443 /* Duplicated allocation code from _PyObject_New() instead of a call to
1444 * PyObject_New() so we are able to allocate space for the object and
1445 * it's data buffer.
1446 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001447 obj = (PyObject *) PyObject_Malloc(struct_size + (size + 1) * char_size);
Victor Stinner04fc4f22020-06-16 01:28:07 +02001448 if (obj == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001449 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02001450 }
1451 _PyObject_Init(obj, &PyUnicode_Type);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001452
1453 unicode = (PyCompactUnicodeObject *)obj;
1454 if (is_ascii)
1455 data = ((PyASCIIObject*)obj) + 1;
1456 else
1457 data = unicode + 1;
1458 _PyUnicode_LENGTH(unicode) = size;
1459 _PyUnicode_HASH(unicode) = -1;
1460 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001461 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001462 _PyUnicode_STATE(unicode).compact = 1;
1463 _PyUnicode_STATE(unicode).ready = 1;
1464 _PyUnicode_STATE(unicode).ascii = is_ascii;
1465 if (is_ascii) {
1466 ((char*)data)[size] = 0;
1467 _PyUnicode_WSTR(unicode) = NULL;
1468 }
Victor Stinner8f825062012-04-27 13:55:39 +02001469 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470 ((char*)data)[size] = 0;
1471 _PyUnicode_WSTR(unicode) = NULL;
1472 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001474 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001475 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476 else {
1477 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001478 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001479 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001480 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001481 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482 ((Py_UCS4*)data)[size] = 0;
1483 if (is_sharing) {
1484 _PyUnicode_WSTR_LENGTH(unicode) = size;
1485 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1486 }
1487 else {
1488 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1489 _PyUnicode_WSTR(unicode) = NULL;
1490 }
1491 }
Victor Stinner8f825062012-04-27 13:55:39 +02001492#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001493 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001494#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001495 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001496 return obj;
1497}
1498
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t range [begin, end) to UCS4 and store the result
   in the 4-byte data buffer of `unicode`.  A high surrogate immediately
   followed by a low surrogate is joined into one code point; every other
   unit (lone surrogates included) is copied unchanged.  The other
   conversions are implemented as macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);

    Py_UCS4 *const base = PyUnicode_4BYTE_DATA(unicode);
    Py_UCS4 *dst = base;
    const wchar_t *src = begin;

    while (src < end) {
        /* There must still be room for the code point being written. */
        assert(dst < (base + _PyUnicode_GET_LENGTH(unicode)));

        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst++ = *src;
            src++;
        }
    }

    /* Exactly _PyUnicode_GET_LENGTH(unicode) code points were written. */
    assert(dst == (base + _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1538
Victor Stinnercd9950f2011-10-02 00:34:53 +02001539static int
Victor Stinner488fa492011-12-12 00:01:39 +01001540unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001541{
Victor Stinner488fa492011-12-12 00:01:39 +01001542 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001543 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001544 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001545 return -1;
1546 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001547 return 0;
1548}
1549
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001550static int
1551_copy_characters(PyObject *to, Py_ssize_t to_start,
1552 PyObject *from, Py_ssize_t from_start,
1553 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001554{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001555 unsigned int from_kind, to_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001556 const void *from_data;
1557 void *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001558
Victor Stinneree4544c2012-05-09 22:24:08 +02001559 assert(0 <= how_many);
1560 assert(0 <= from_start);
1561 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001562 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001563 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001564 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001565
Victor Stinnerd3f08822012-05-29 12:57:52 +02001566 assert(PyUnicode_Check(to));
1567 assert(PyUnicode_IS_READY(to));
1568 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1569
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001570 if (how_many == 0)
1571 return 0;
1572
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001573 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001574 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001575 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001576 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001577
Victor Stinnerf1852262012-06-16 16:38:26 +02001578#ifdef Py_DEBUG
1579 if (!check_maxchar
1580 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1581 {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001582 Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerf1852262012-06-16 16:38:26 +02001583 Py_UCS4 ch;
1584 Py_ssize_t i;
1585 for (i=0; i < how_many; i++) {
1586 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1587 assert(ch <= to_maxchar);
1588 }
1589 }
1590#endif
1591
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001592 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001593 if (check_maxchar
1594 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1595 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001596 /* Writing Latin-1 characters into an ASCII string requires to
1597 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001598 Py_UCS4 max_char;
1599 max_char = ucs1lib_find_max_char(from_data,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001600 (const Py_UCS1*)from_data + how_many);
Victor Stinnerf1852262012-06-16 16:38:26 +02001601 if (max_char >= 128)
1602 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001603 }
Christian Heimesf051e432016-09-13 20:22:02 +02001604 memcpy((char*)to_data + to_kind * to_start,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03001605 (const char*)from_data + from_kind * from_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001606 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001607 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001608 else if (from_kind == PyUnicode_1BYTE_KIND
1609 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001610 {
1611 _PyUnicode_CONVERT_BYTES(
1612 Py_UCS1, Py_UCS2,
1613 PyUnicode_1BYTE_DATA(from) + from_start,
1614 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1615 PyUnicode_2BYTE_DATA(to) + to_start
1616 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001617 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001618 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001619 && to_kind == PyUnicode_4BYTE_KIND)
1620 {
1621 _PyUnicode_CONVERT_BYTES(
1622 Py_UCS1, Py_UCS4,
1623 PyUnicode_1BYTE_DATA(from) + from_start,
1624 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1625 PyUnicode_4BYTE_DATA(to) + to_start
1626 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001627 }
1628 else if (from_kind == PyUnicode_2BYTE_KIND
1629 && to_kind == PyUnicode_4BYTE_KIND)
1630 {
1631 _PyUnicode_CONVERT_BYTES(
1632 Py_UCS2, Py_UCS4,
1633 PyUnicode_2BYTE_DATA(from) + from_start,
1634 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1635 PyUnicode_4BYTE_DATA(to) + to_start
1636 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001637 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001638 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001639 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1640
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001641 if (!check_maxchar) {
1642 if (from_kind == PyUnicode_2BYTE_KIND
1643 && to_kind == PyUnicode_1BYTE_KIND)
1644 {
1645 _PyUnicode_CONVERT_BYTES(
1646 Py_UCS2, Py_UCS1,
1647 PyUnicode_2BYTE_DATA(from) + from_start,
1648 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1649 PyUnicode_1BYTE_DATA(to) + to_start
1650 );
1651 }
1652 else if (from_kind == PyUnicode_4BYTE_KIND
1653 && to_kind == PyUnicode_1BYTE_KIND)
1654 {
1655 _PyUnicode_CONVERT_BYTES(
1656 Py_UCS4, Py_UCS1,
1657 PyUnicode_4BYTE_DATA(from) + from_start,
1658 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1659 PyUnicode_1BYTE_DATA(to) + to_start
1660 );
1661 }
1662 else if (from_kind == PyUnicode_4BYTE_KIND
1663 && to_kind == PyUnicode_2BYTE_KIND)
1664 {
1665 _PyUnicode_CONVERT_BYTES(
1666 Py_UCS4, Py_UCS2,
1667 PyUnicode_4BYTE_DATA(from) + from_start,
1668 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1669 PyUnicode_2BYTE_DATA(to) + to_start
1670 );
1671 }
1672 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001673 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001674 }
1675 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001676 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001677 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001678 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001679 Py_ssize_t i;
1680
Victor Stinnera0702ab2011-09-29 14:14:38 +02001681 for (i=0; i < how_many; i++) {
1682 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001683 if (ch > to_maxchar)
1684 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001685 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1686 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001687 }
1688 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001689 return 0;
1690}
1691
Victor Stinnerd3f08822012-05-29 12:57:52 +02001692void
1693_PyUnicode_FastCopyCharacters(
1694 PyObject *to, Py_ssize_t to_start,
1695 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001696{
1697 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1698}
1699
1700Py_ssize_t
1701PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1702 PyObject *from, Py_ssize_t from_start,
1703 Py_ssize_t how_many)
1704{
1705 int err;
1706
1707 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1708 PyErr_BadInternalCall();
1709 return -1;
1710 }
1711
Benjamin Petersonbac79492012-01-14 13:34:47 -05001712 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001713 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001714 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001715 return -1;
1716
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001717 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001718 PyErr_SetString(PyExc_IndexError, "string index out of range");
1719 return -1;
1720 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001721 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001722 PyErr_SetString(PyExc_IndexError, "string index out of range");
1723 return -1;
1724 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001725 if (how_many < 0) {
1726 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1727 return -1;
1728 }
1729 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001730 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1731 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001732 "Cannot write %zi characters at %zi "
1733 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001734 how_many, to_start, PyUnicode_GET_LENGTH(to));
1735 return -1;
1736 }
1737
1738 if (how_many == 0)
1739 return 0;
1740
Victor Stinner488fa492011-12-12 00:01:39 +01001741 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001742 return -1;
1743
1744 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1745 if (err) {
1746 PyErr_Format(PyExc_SystemError,
1747 "Cannot copy %s characters "
1748 "into a string of %s characters",
1749 unicode_kind_name(from),
1750 unicode_kind_name(to));
1751 return -1;
1752 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001753 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001754}
1755
Victor Stinner17222162011-09-28 22:15:37 +02001756/* Find the maximum code point and count the number of surrogate pairs so a
1757 correct string length can be computed before converting a string to UCS4.
1758 This function counts single surrogates as a character and not as a pair.
1759
1760 Return 0 on success, or -1 on error. */
1761static int
1762find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1763 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764{
1765 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001766 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767
Victor Stinnerc53be962011-10-02 21:33:54 +02001768 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001769 *num_surrogates = 0;
1770 *maxchar = 0;
1771
1772 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001773#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001774 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1775 && (iter+1) < end
1776 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1777 {
1778 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1779 ++(*num_surrogates);
1780 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001781 }
1782 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001783#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001784 {
1785 ch = *iter;
1786 iter++;
1787 }
1788 if (ch > *maxchar) {
1789 *maxchar = ch;
1790 if (*maxchar > MAX_UNICODE) {
1791 PyErr_Format(PyExc_ValueError,
Victor Stinner99768342021-03-17 21:46:53 +01001792 "character U+%x is not in range [U+0000; U+%x]",
1793 ch, MAX_UNICODE);
Victor Stinner8faf8212011-12-08 22:14:11 +01001794 return -1;
1795 }
1796 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 }
1798 return 0;
1799}
1800
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001801int
1802_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001803{
1804 wchar_t *end;
1805 Py_UCS4 maxchar = 0;
1806 Py_ssize_t num_surrogates;
1807#if SIZEOF_WCHAR_T == 2
1808 Py_ssize_t length_wo_surrogates;
1809#endif
1810
Georg Brandl7597add2011-10-05 16:36:47 +02001811 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001812 strings were created using _PyObject_New() and where no canonical
1813 representation (the str field) has been set yet aka strings
1814 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001815 assert(_PyUnicode_CHECK(unicode));
1816 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001817 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001818 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001819 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001820 /* Actually, it should neither be interned nor be anything else: */
1821 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001822
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001824 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001825 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001826 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001827
1828 if (maxchar < 256) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001829 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(_PyUnicode_WSTR_LENGTH(unicode) + 1);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001830 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001831 PyErr_NoMemory();
1832 return -1;
1833 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001834 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001835 _PyUnicode_WSTR(unicode), end,
1836 PyUnicode_1BYTE_DATA(unicode));
1837 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1838 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1839 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1840 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001841 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001842 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001843 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001844 }
1845 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001846 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001847 _PyUnicode_UTF8(unicode) = NULL;
1848 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001850 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001851 _PyUnicode_WSTR(unicode) = NULL;
1852 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1853 }
1854 /* In this case we might have to convert down from 4-byte native
1855 wchar_t to 2-byte unicode. */
1856 else if (maxchar < 65536) {
1857 assert(num_surrogates == 0 &&
1858 "FindMaxCharAndNumSurrogatePairs() messed up");
1859
Victor Stinner506f5922011-09-28 22:34:18 +02001860#if SIZEOF_WCHAR_T == 2
1861 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001862 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001863 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1864 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1865 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001866 _PyUnicode_UTF8(unicode) = NULL;
1867 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001868#else
1869 /* sizeof(wchar_t) == 4 */
Victor Stinner32bd68c2020-12-01 10:37:39 +01001870 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(
Victor Stinner506f5922011-09-28 22:34:18 +02001871 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001872 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001873 PyErr_NoMemory();
1874 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875 }
Victor Stinner506f5922011-09-28 22:34:18 +02001876 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1877 _PyUnicode_WSTR(unicode), end,
1878 PyUnicode_2BYTE_DATA(unicode));
1879 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1880 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1881 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001882 _PyUnicode_UTF8(unicode) = NULL;
1883 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner32bd68c2020-12-01 10:37:39 +01001884 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinner506f5922011-09-28 22:34:18 +02001885 _PyUnicode_WSTR(unicode) = NULL;
1886 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1887#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 }
Ikko Ashimine38811d62020-11-10 14:57:34 +09001889 /* maxchar exceeds 16 bit, wee need 4 bytes for unicode characters */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001890 else {
1891#if SIZEOF_WCHAR_T == 2
1892 /* in case the native representation is 2-bytes, we need to allocate a
1893 new normalized 4-byte version. */
1894 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001895 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1896 PyErr_NoMemory();
1897 return -1;
1898 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01001899 _PyUnicode_DATA_ANY(unicode) = PyObject_Malloc(4 * (length_wo_surrogates + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001900 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 PyErr_NoMemory();
1902 return -1;
1903 }
1904 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1905 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001906 _PyUnicode_UTF8(unicode) = NULL;
1907 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001908 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1909 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001910 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Victor Stinner32bd68c2020-12-01 10:37:39 +01001911 PyObject_Free(_PyUnicode_WSTR(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001912 _PyUnicode_WSTR(unicode) = NULL;
1913 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1914#else
1915 assert(num_surrogates == 0);
1916
Victor Stinnerc3c74152011-10-02 20:39:55 +02001917 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001919 _PyUnicode_UTF8(unicode) = NULL;
1920 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001921 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1922#endif
1923 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1924 }
1925 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001926 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001927 return 0;
1928}
1929
Alexander Belopolsky40018472011-02-26 01:02:56 +00001930static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001931unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932{
Walter Dörwald16807132007-05-25 13:52:07 +00001933 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001934 case SSTATE_NOT_INTERNED:
1935 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001936
Benjamin Peterson29060642009-01-31 22:14:21 +00001937 case SSTATE_INTERNED_MORTAL:
Victor Stinnerea251802020-12-26 02:58:33 +01001938 {
1939 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner3549ca32020-07-03 16:59:12 +02001940 /* Revive the dead object temporarily. PyDict_DelItem() removes two
1941 references (key and value) which were ignored by
1942 PyUnicode_InternInPlace(). Use refcnt=3 rather than refcnt=2
1943 to prevent calling unicode_dealloc() again. Adjust refcnt after
1944 PyDict_DelItem(). */
1945 assert(Py_REFCNT(unicode) == 0);
1946 Py_SET_REFCNT(unicode, 3);
Victor Stinnerea251802020-12-26 02:58:33 +01001947 if (PyDict_DelItem(state->interned, unicode) != 0) {
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001948 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1949 NULL);
1950 }
Victor Stinner3549ca32020-07-03 16:59:12 +02001951 assert(Py_REFCNT(unicode) == 1);
1952 Py_SET_REFCNT(unicode, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +00001953 break;
Victor Stinnerea251802020-12-26 02:58:33 +01001954 }
Walter Dörwald16807132007-05-25 13:52:07 +00001955
Benjamin Peterson29060642009-01-31 22:14:21 +00001956 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001957 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1958 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001959
Benjamin Peterson29060642009-01-31 22:14:21 +00001960 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001961 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001962 }
1963
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001964 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001965 PyObject_Free(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001966 }
1967 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001968 PyObject_Free(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001969 }
1970 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinner32bd68c2020-12-01 10:37:39 +01001971 PyObject_Free(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001972 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001973
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001974 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975}
1976
#ifdef Py_DEBUG
/* Debug-only helper.

   Return 1 if `unicode` is the empty string stored in the unicode state, or
   a one-character string that is stored in the state's latin1[] table for
   its character. Return 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    struct _Py_unicode_state *cache = get_unicode_state();
    PyASCIIObject *base = (PyASCIIObject *)unicode;

    if (unicode == cache->empty_string) {
        return 1;
    }

    /* Only a non-wchar-kind string of length 1 can live in latin1[]. */
    if (base->state.kind == PyUnicode_WCHAR_KIND || base->length != 1) {
        return 0;
    }

    Py_UCS4 first = PyUnicode_READ_CHAR(unicode, 0);
    if (first < 256 && cache->latin1[first] == unicode) {
        return 1;
    }
    return 0;
}
#endif
1996
Alexander Belopolsky40018472011-02-26 01:02:56 +00001997static int
Victor Stinner488fa492011-12-12 00:01:39 +01001998unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001999{
Victor Stinner488fa492011-12-12 00:01:39 +01002000 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02002001 if (Py_REFCNT(unicode) != 1)
2002 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002003 if (_PyUnicode_HASH(unicode) != -1)
2004 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002005 if (PyUnicode_CHECK_INTERNED(unicode))
2006 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01002007 if (!PyUnicode_CheckExact(unicode))
2008 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02002009#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02002010 /* singleton refcount is greater than 1 */
2011 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02002012#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02002013 return 1;
2014}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002015
Victor Stinnerfe226c02011-10-03 03:52:20 +02002016static int
2017unicode_resize(PyObject **p_unicode, Py_ssize_t length)
2018{
2019 PyObject *unicode;
2020 Py_ssize_t old_length;
2021
2022 assert(p_unicode != NULL);
2023 unicode = *p_unicode;
2024
2025 assert(unicode != NULL);
2026 assert(PyUnicode_Check(unicode));
2027 assert(0 <= length);
2028
Victor Stinner910337b2011-10-03 03:20:16 +02002029 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002030 old_length = PyUnicode_WSTR_LENGTH(unicode);
2031 else
2032 old_length = PyUnicode_GET_LENGTH(unicode);
2033 if (old_length == length)
2034 return 0;
2035
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002036 if (length == 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002037 PyObject *empty = unicode_new_empty();
Victor Stinnerf363d0a2020-06-24 00:10:40 +02002038 Py_SETREF(*p_unicode, empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002039 return 0;
2040 }
2041
Victor Stinner488fa492011-12-12 00:01:39 +01002042 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002043 PyObject *copy = resize_copy(unicode, length);
2044 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002045 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002046 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002047 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002048 }
2049
Victor Stinnerfe226c02011-10-03 03:52:20 +02002050 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002051 PyObject *new_unicode = resize_compact(unicode, length);
2052 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002053 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002054 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002055 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002056 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002057 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002058}
2059
Alexander Belopolsky40018472011-02-26 01:02:56 +00002060int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002061PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002062{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002063 PyObject *unicode;
2064 if (p_unicode == NULL) {
2065 PyErr_BadInternalCall();
2066 return -1;
2067 }
2068 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002069 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002070 {
2071 PyErr_BadInternalCall();
2072 return -1;
2073 }
2074 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002075}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002076
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002077/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002078
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002079 WARNING: The function doesn't copy the terminating null character and
2080 doesn't check the maximum character (may write a latin1 character in an
2081 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002082static void
2083unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2084 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002085{
2086 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002087 const void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002088 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002089
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002090 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002091 switch (kind) {
2092 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002093#ifdef Py_DEBUG
2094 if (PyUnicode_IS_ASCII(unicode)) {
2095 Py_UCS4 maxchar = ucs1lib_find_max_char(
2096 (const Py_UCS1*)str,
2097 (const Py_UCS1*)str + len);
2098 assert(maxchar < 128);
2099 }
2100#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002101 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002102 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002103 }
2104 case PyUnicode_2BYTE_KIND: {
2105 Py_UCS2 *start = (Py_UCS2 *)data + index;
2106 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002107
Victor Stinner184252a2012-06-16 02:57:41 +02002108 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002109 *ucs2 = (Py_UCS2)*str;
2110
2111 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002112 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002113 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002114 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002115 Py_UCS4 *start = (Py_UCS4 *)data + index;
2116 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002117
Victor Stinner184252a2012-06-16 02:57:41 +02002118 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002119 *ucs4 = (Py_UCS4)*str;
2120
2121 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002122 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002123 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002124 default:
2125 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002126 }
2127}
2128
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129static PyObject*
Victor Stinner2f9ada92020-06-24 02:22:21 +02002130get_latin1_char(Py_UCS1 ch)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002131{
Victor Stinner2f9ada92020-06-24 02:22:21 +02002132 struct _Py_unicode_state *state = get_unicode_state();
Victor Stinner607b1022020-05-05 18:50:30 +02002133
Victor Stinner2f9ada92020-06-24 02:22:21 +02002134 PyObject *unicode = state->latin1[ch];
Victor Stinner607b1022020-05-05 18:50:30 +02002135 if (unicode) {
2136 Py_INCREF(unicode);
2137 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002138 }
Victor Stinner607b1022020-05-05 18:50:30 +02002139
2140 unicode = PyUnicode_New(1, ch);
2141 if (!unicode) {
2142 return NULL;
2143 }
2144
2145 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
2146 assert(_PyUnicode_CheckConsistency(unicode, 1));
2147
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002148 Py_INCREF(unicode);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002149 state->latin1[ch] = unicode;
Victor Stinnera464fc12011-10-02 20:39:30 +02002150 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151}
2152
Victor Stinner985a82a2014-01-03 12:53:47 +01002153static PyObject*
2154unicode_char(Py_UCS4 ch)
2155{
2156 PyObject *unicode;
2157
2158 assert(ch <= MAX_UNICODE);
2159
Victor Stinner2f9ada92020-06-24 02:22:21 +02002160 if (ch < 256) {
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002161 return get_latin1_char(ch);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002162 }
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002163
Victor Stinner985a82a2014-01-03 12:53:47 +01002164 unicode = PyUnicode_New(1, ch);
2165 if (unicode == NULL)
2166 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002167
2168 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2169 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002170 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002171 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002172 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2173 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2174 }
2175 assert(_PyUnicode_CheckConsistency(unicode, 1));
2176 return unicode;
2177}
2178
Alexander Belopolsky40018472011-02-26 01:02:56 +00002179PyObject *
2180PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181{
Inada Naoki038dd0f2020-06-30 15:26:56 +09002182 if (u == NULL) {
2183 if (size > 0) {
2184 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2185 "PyUnicode_FromUnicode(NULL, size) is deprecated; "
2186 "use PyUnicode_New() instead", 1) < 0) {
2187 return NULL;
2188 }
2189 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002190 return (PyObject*)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002191 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002192
2193 if (size < 0) {
2194 PyErr_BadInternalCall();
2195 return NULL;
2196 }
2197
2198 return PyUnicode_FromWideChar(u, size);
2199}
2200
2201PyObject *
2202PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2203{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002204 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002205 Py_UCS4 maxchar = 0;
2206 Py_ssize_t num_surrogates;
2207
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002208 if (u == NULL && size != 0) {
2209 PyErr_BadInternalCall();
2210 return NULL;
2211 }
2212
2213 if (size == -1) {
2214 size = wcslen(u);
2215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002216
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002217 /* If the Unicode data is known at construction time, we can apply
2218 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002220 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002221 if (size == 0)
2222 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002223
Jakub Kulík9032cf52021-04-30 15:21:42 +02002224#ifdef HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
2225 /* Oracle Solaris uses non-Unicode internal wchar_t form for
2226 non-Unicode locales and hence needs conversion to UCS-4 first. */
2227 if (_Py_LocaleUsesNonUnicodeWchar()) {
2228 wchar_t* converted = _Py_DecodeNonUnicodeWchar(u, size);
2229 if (!converted) {
2230 return NULL;
2231 }
2232 PyObject *unicode = _PyUnicode_FromUCS4(converted, size);
2233 PyMem_Free(converted);
2234 return unicode;
2235 }
2236#endif
2237
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002238 /* Single character Unicode objects in the Latin-1 range are
2239 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002240 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002241 return get_latin1_char((unsigned char)*u);
2242
2243 /* If not empty and not single character, copy the Unicode data
2244 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002245 if (find_maxchar_surrogates(u, u + size,
2246 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002247 return NULL;
2248
Victor Stinner8faf8212011-12-08 22:14:11 +01002249 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002250 if (!unicode)
2251 return NULL;
2252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002253 switch (PyUnicode_KIND(unicode)) {
2254 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002255 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002256 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2257 break;
2258 case PyUnicode_2BYTE_KIND:
2259#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002260 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002261#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002262 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2264#endif
2265 break;
2266 case PyUnicode_4BYTE_KIND:
2267#if SIZEOF_WCHAR_T == 2
2268 /* This is the only case which has to process surrogates, thus
2269 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002270 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002271#else
2272 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002273 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002274#endif
2275 break;
2276 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002277 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002280 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002281}
2282
Alexander Belopolsky40018472011-02-26 01:02:56 +00002283PyObject *
2284PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002285{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002286 if (size < 0) {
2287 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002288 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002289 return NULL;
2290 }
Inada Naoki038dd0f2020-06-30 15:26:56 +09002291 if (u != NULL) {
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002292 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002293 }
2294 else {
2295 if (size > 0) {
2296 if (PyErr_WarnEx(PyExc_DeprecationWarning,
2297 "PyUnicode_FromStringAndSize(NULL, size) is deprecated; "
2298 "use PyUnicode_New() instead", 1) < 0) {
2299 return NULL;
2300 }
2301 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002302 return (PyObject *)_PyUnicode_New(size);
Inada Naoki038dd0f2020-06-30 15:26:56 +09002303 }
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002304}
2305
Alexander Belopolsky40018472011-02-26 01:02:56 +00002306PyObject *
2307PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002308{
2309 size_t size = strlen(u);
2310 if (size > PY_SSIZE_T_MAX) {
2311 PyErr_SetString(PyExc_OverflowError, "input too long");
2312 return NULL;
2313 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002314 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002315}
2316
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002317
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002318PyObject *
2319_PyUnicode_FromId(_Py_Identifier *id)
2320{
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002321 PyInterpreterState *interp = _PyInterpreterState_GET();
2322 struct _Py_unicode_ids *ids = &interp->unicode.ids;
2323
Pablo Galindoa6d63a22020-12-29 00:28:09 +00002324 Py_ssize_t index = _Py_atomic_size_get(&id->index);
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002325 if (index < 0) {
2326 struct _Py_unicode_runtime_ids *rt_ids = &interp->runtime->unicode_ids;
2327
2328 PyThread_acquire_lock(rt_ids->lock, WAIT_LOCK);
2329 // Check again to detect concurrent access. Another thread can have
2330 // initialized the index while this thread waited for the lock.
2331 index = _Py_atomic_size_get(&id->index);
2332 if (index < 0) {
2333 assert(rt_ids->next_index < PY_SSIZE_T_MAX);
2334 index = rt_ids->next_index;
2335 rt_ids->next_index++;
2336 _Py_atomic_size_set(&id->index, index);
2337 }
2338 PyThread_release_lock(rt_ids->lock);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002339 }
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002340 assert(index >= 0);
Victor Stinner297257f2020-06-02 14:39:45 +02002341
2342 PyObject *obj;
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002343 if (index < ids->size) {
2344 obj = ids->array[index];
2345 if (obj) {
2346 // Return a borrowed reference
2347 return obj;
2348 }
2349 }
2350
2351 obj = PyUnicode_DecodeUTF8Stateful(id->string, strlen(id->string),
Victor Stinner297257f2020-06-02 14:39:45 +02002352 NULL, NULL);
2353 if (!obj) {
2354 return NULL;
2355 }
2356 PyUnicode_InternInPlace(&obj);
2357
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002358 if (index >= ids->size) {
2359 // Overallocate to reduce the number of realloc
2360 Py_ssize_t new_size = Py_MAX(index * 2, 16);
2361 Py_ssize_t item_size = sizeof(ids->array[0]);
2362 PyObject **new_array = PyMem_Realloc(ids->array, new_size * item_size);
2363 if (new_array == NULL) {
2364 PyErr_NoMemory();
2365 return NULL;
2366 }
2367 memset(&new_array[ids->size], 0, (new_size - ids->size) * item_size);
2368 ids->array = new_array;
2369 ids->size = new_size;
2370 }
2371
2372 // The array stores a strong reference
2373 ids->array[index] = obj;
2374
2375 // Return a borrowed reference
2376 return obj;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002377}
2378
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002379
Victor Stinnerd6fb53f2020-05-14 01:11:54 +02002380static void
Victor Stinnerf4507232020-12-26 20:26:08 +01002381unicode_clear_identifiers(struct _Py_unicode_state *state)
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002382{
Victor Stinnerf4507232020-12-26 20:26:08 +01002383 struct _Py_unicode_ids *ids = &state->ids;
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002384 for (Py_ssize_t i=0; i < ids->size; i++) {
2385 Py_XDECREF(ids->array[i]);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002386 }
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002387 ids->size = 0;
2388 PyMem_Free(ids->array);
2389 ids->array = NULL;
2390 // Don't reset _PyRuntime next_index: _Py_Identifier.id remains valid
2391 // after Py_Finalize().
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002392}
2393
Victor Stinnerba3d67c2020-12-26 00:41:46 +01002394
Benjamin Peterson0df54292012-03-26 14:50:32 -04002395/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002396
Victor Stinnerd3f08822012-05-29 12:57:52 +02002397PyObject*
2398_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002399{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002400 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002401 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002402 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002403#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002404 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002405#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002406 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002407 }
Victor Stinner785938e2011-12-11 20:09:03 +01002408 unicode = PyUnicode_New(size, 127);
2409 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002410 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002411 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2412 assert(_PyUnicode_CheckConsistency(unicode, 1));
2413 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002414}
2415
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002416static Py_UCS4
2417kind_maxchar_limit(unsigned int kind)
2418{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002419 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002420 case PyUnicode_1BYTE_KIND:
2421 return 0x80;
2422 case PyUnicode_2BYTE_KIND:
2423 return 0x100;
2424 case PyUnicode_4BYTE_KIND:
2425 return 0x10000;
2426 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002427 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002428 }
2429}
2430
Victor Stinner702c7342011-10-05 13:50:52 +02002431static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002432_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002433{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002434 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002435 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002436
Victor Stinner2f9ada92020-06-24 02:22:21 +02002437 if (size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02002438 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner2f9ada92020-06-24 02:22:21 +02002439 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002440 assert(size > 0);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002441 if (size == 1) {
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002442 return get_latin1_char(u[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02002443 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002444
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002445 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002446 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447 if (!res)
2448 return NULL;
2449 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002450 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002451 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002452}
2453
Victor Stinnere57b1c02011-09-28 22:20:48 +02002454static PyObject*
2455_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002456{
2457 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002458 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002459
Serhiy Storchaka678db842013-01-26 12:16:36 +02002460 if (size == 0)
2461 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002462 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002463 if (size == 1)
2464 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002465
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002466 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002467 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002468 if (!res)
2469 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002470 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002471 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002472 else {
2473 _PyUnicode_CONVERT_BYTES(
2474 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2475 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002476 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002477 return res;
2478}
2479
Victor Stinnere57b1c02011-09-28 22:20:48 +02002480static PyObject*
2481_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002482{
2483 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002484 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002485
Serhiy Storchaka678db842013-01-26 12:16:36 +02002486 if (size == 0)
2487 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002488 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002489 if (size == 1)
2490 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002491
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002492 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002493 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002494 if (!res)
2495 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002496 if (max_char < 256)
2497 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2498 PyUnicode_1BYTE_DATA(res));
2499 else if (max_char < 0x10000)
2500 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2501 PyUnicode_2BYTE_DATA(res));
2502 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002504 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002505 return res;
2506}
2507
2508PyObject*
2509PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2510{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002511 if (size < 0) {
2512 PyErr_SetString(PyExc_ValueError, "size must be positive");
2513 return NULL;
2514 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002515 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002516 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002517 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002519 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002520 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002521 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002522 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002523 PyErr_SetString(PyExc_SystemError, "invalid kind");
2524 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002525 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002526}
2527
Victor Stinnerece58de2012-04-23 23:36:38 +02002528Py_UCS4
2529_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2530{
2531 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002532 const void *startptr, *endptr;
Victor Stinnerece58de2012-04-23 23:36:38 +02002533
2534 assert(PyUnicode_IS_READY(unicode));
2535 assert(0 <= start);
2536 assert(end <= PyUnicode_GET_LENGTH(unicode));
2537 assert(start <= end);
2538
2539 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2540 return PyUnicode_MAX_CHAR_VALUE(unicode);
2541
2542 if (start == end)
2543 return 127;
2544
Victor Stinner94d558b2012-04-27 22:26:58 +02002545 if (PyUnicode_IS_ASCII(unicode))
2546 return 127;
2547
Victor Stinnerece58de2012-04-23 23:36:38 +02002548 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002549 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002550 endptr = (char *)startptr + end * kind;
2551 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002552 switch(kind) {
2553 case PyUnicode_1BYTE_KIND:
2554 return ucs1lib_find_max_char(startptr, endptr);
2555 case PyUnicode_2BYTE_KIND:
2556 return ucs2lib_find_max_char(startptr, endptr);
2557 case PyUnicode_4BYTE_KIND:
2558 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002559 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002560 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002561 }
2562}
2563
Victor Stinner25a4b292011-10-06 12:31:55 +02002564/* Ensure that a string uses the most efficient storage, if it is not the
2565 case: create a new string with of the right kind. Write NULL into *p_unicode
2566 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002567static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002568unicode_adjust_maxchar(PyObject **p_unicode)
2569{
2570 PyObject *unicode, *copy;
2571 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002572 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002573 unsigned int kind;
2574
2575 assert(p_unicode != NULL);
2576 unicode = *p_unicode;
2577 assert(PyUnicode_IS_READY(unicode));
2578 if (PyUnicode_IS_ASCII(unicode))
2579 return;
2580
2581 len = PyUnicode_GET_LENGTH(unicode);
2582 kind = PyUnicode_KIND(unicode);
2583 if (kind == PyUnicode_1BYTE_KIND) {
2584 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002585 max_char = ucs1lib_find_max_char(u, u + len);
2586 if (max_char >= 128)
2587 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002588 }
2589 else if (kind == PyUnicode_2BYTE_KIND) {
2590 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002591 max_char = ucs2lib_find_max_char(u, u + len);
2592 if (max_char >= 256)
2593 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002594 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002595 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002596 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002597 max_char = ucs4lib_find_max_char(u, u + len);
2598 if (max_char >= 0x10000)
2599 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002600 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002601 else
2602 Py_UNREACHABLE();
2603
Victor Stinner25a4b292011-10-06 12:31:55 +02002604 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002605 if (copy != NULL)
2606 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002607 Py_DECREF(unicode);
2608 *p_unicode = copy;
2609}
2610
Victor Stinner034f6cf2011-09-30 02:26:44 +02002611PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002612_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002613{
Victor Stinner87af4f22011-11-21 23:03:47 +01002614 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002615 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002616
Victor Stinner034f6cf2011-09-30 02:26:44 +02002617 if (!PyUnicode_Check(unicode)) {
2618 PyErr_BadInternalCall();
2619 return NULL;
2620 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002621 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002622 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002623
Victor Stinner87af4f22011-11-21 23:03:47 +01002624 length = PyUnicode_GET_LENGTH(unicode);
2625 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002626 if (!copy)
2627 return NULL;
2628 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2629
Christian Heimesf051e432016-09-13 20:22:02 +02002630 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002631 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002632 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002633 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002634}
2635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002636
Victor Stinnerbc603d12011-10-02 01:00:40 +02002637/* Widen Unicode objects to larger buffers. Don't write terminating null
2638 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002639
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002640static void*
2641unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002642{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002643 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002644
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002645 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002646 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002647 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002648 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002649 if (!result)
2650 return PyErr_NoMemory();
2651 assert(skind == PyUnicode_1BYTE_KIND);
2652 _PyUnicode_CONVERT_BYTES(
2653 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002654 (const Py_UCS1 *)data,
2655 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002656 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002657 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002658 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002659 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002660 if (!result)
2661 return PyErr_NoMemory();
2662 if (skind == PyUnicode_2BYTE_KIND) {
2663 _PyUnicode_CONVERT_BYTES(
2664 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002665 (const Py_UCS2 *)data,
2666 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002667 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002668 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002669 else {
2670 assert(skind == PyUnicode_1BYTE_KIND);
2671 _PyUnicode_CONVERT_BYTES(
2672 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002673 (const Py_UCS1 *)data,
2674 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002675 result);
2676 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002677 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002678 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002679 Py_UNREACHABLE();
2680 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002682}
2683
2684static Py_UCS4*
2685as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2686 int copy_null)
2687{
2688 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002689 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002690 Py_ssize_t len, targetlen;
2691 if (PyUnicode_READY(string) == -1)
2692 return NULL;
2693 kind = PyUnicode_KIND(string);
2694 data = PyUnicode_DATA(string);
2695 len = PyUnicode_GET_LENGTH(string);
2696 targetlen = len;
2697 if (copy_null)
2698 targetlen++;
2699 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002700 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002701 if (!target) {
2702 PyErr_NoMemory();
2703 return NULL;
2704 }
2705 }
2706 else {
2707 if (targetsize < targetlen) {
2708 PyErr_Format(PyExc_SystemError,
2709 "string is longer than the buffer");
2710 if (copy_null && 0 < targetsize)
2711 target[0] = 0;
2712 return NULL;
2713 }
2714 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002715 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002716 const Py_UCS1 *start = (const Py_UCS1 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002717 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002718 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002719 else if (kind == PyUnicode_2BYTE_KIND) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002720 const Py_UCS2 *start = (const Py_UCS2 *) data;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002721 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2722 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002723 else if (kind == PyUnicode_4BYTE_KIND) {
Christian Heimesf051e432016-09-13 20:22:02 +02002724 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002725 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03002726 else {
2727 Py_UNREACHABLE();
2728 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002729 if (copy_null)
2730 target[len] = 0;
2731 return target;
2732}
2733
2734Py_UCS4*
2735PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2736 int copy_null)
2737{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002738 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002739 PyErr_BadInternalCall();
2740 return NULL;
2741 }
2742 return as_ucs4(string, target, targetsize, copy_null);
2743}
2744
2745Py_UCS4*
2746PyUnicode_AsUCS4Copy(PyObject *string)
2747{
2748 return as_ucs4(string, NULL, 0, 1);
2749}
2750
/* Maximum number of characters required for the output of %lld or %p.
   A long long needs at most ceil(log10(256)*SIZEOF_LONG_LONG) decimal
   digits, plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2754#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002755
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002756static int
2757unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2758 Py_ssize_t width, Py_ssize_t precision)
2759{
2760 Py_ssize_t length, fill, arglen;
2761 Py_UCS4 maxchar;
2762
2763 if (PyUnicode_READY(str) == -1)
2764 return -1;
2765
2766 length = PyUnicode_GET_LENGTH(str);
2767 if ((precision == -1 || precision >= length)
2768 && width <= length)
2769 return _PyUnicodeWriter_WriteStr(writer, str);
2770
2771 if (precision != -1)
2772 length = Py_MIN(precision, length);
2773
2774 arglen = Py_MAX(length, width);
2775 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2776 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2777 else
2778 maxchar = writer->maxchar;
2779
2780 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2781 return -1;
2782
2783 if (width > length) {
2784 fill = width - length;
2785 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2786 return -1;
2787 writer->pos += fill;
2788 }
2789
2790 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2791 str, 0, length);
2792 writer->pos += length;
2793 return 0;
2794}
2795
2796static int
Victor Stinner998b8062018-09-12 00:23:25 +02002797unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002798 Py_ssize_t width, Py_ssize_t precision)
2799{
2800 /* UTF-8 */
2801 Py_ssize_t length;
2802 PyObject *unicode;
2803 int res;
2804
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002805 if (precision == -1) {
2806 length = strlen(str);
2807 }
2808 else {
2809 length = 0;
2810 while (length < precision && str[length]) {
2811 length++;
2812 }
2813 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002814 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2815 if (unicode == NULL)
2816 return -1;
2817
2818 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2819 Py_DECREF(unicode);
2820 return res;
2821}
2822
Victor Stinner96865452011-03-01 23:44:09 +00002823static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002824unicode_fromformat_arg(_PyUnicodeWriter *writer,
2825 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002826{
Victor Stinnere215d962012-10-06 23:03:36 +02002827 const char *p;
2828 Py_ssize_t len;
2829 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002830 Py_ssize_t width;
2831 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002832 int longflag;
2833 int longlongflag;
2834 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002835 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002836
2837 p = f;
2838 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002839 zeropad = 0;
2840 if (*f == '0') {
2841 zeropad = 1;
2842 f++;
2843 }
Victor Stinner96865452011-03-01 23:44:09 +00002844
2845 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002846 width = -1;
2847 if (Py_ISDIGIT((unsigned)*f)) {
2848 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002849 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002850 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002851 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002852 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002853 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002854 return NULL;
2855 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002856 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002857 f++;
2858 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002859 }
2860 precision = -1;
2861 if (*f == '.') {
2862 f++;
2863 if (Py_ISDIGIT((unsigned)*f)) {
2864 precision = (*f - '0');
2865 f++;
2866 while (Py_ISDIGIT((unsigned)*f)) {
2867 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2868 PyErr_SetString(PyExc_ValueError,
2869 "precision too big");
2870 return NULL;
2871 }
2872 precision = (precision * 10) + (*f - '0');
2873 f++;
2874 }
2875 }
Victor Stinner96865452011-03-01 23:44:09 +00002876 if (*f == '%') {
2877 /* "%.3%s" => f points to "3" */
2878 f--;
2879 }
2880 }
2881 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002882 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002883 f--;
2884 }
Victor Stinner96865452011-03-01 23:44:09 +00002885
2886 /* Handle %ld, %lu, %lld and %llu. */
2887 longflag = 0;
2888 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002889 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002890 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002891 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002892 longflag = 1;
2893 ++f;
2894 }
Victor Stinner96865452011-03-01 23:44:09 +00002895 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002896 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002897 longlongflag = 1;
2898 f += 2;
2899 }
Victor Stinner96865452011-03-01 23:44:09 +00002900 }
2901 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002902 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002903 size_tflag = 1;
2904 ++f;
2905 }
Victor Stinnere215d962012-10-06 23:03:36 +02002906
2907 if (f[1] == '\0')
2908 writer->overallocate = 0;
2909
2910 switch (*f) {
2911 case 'c':
2912 {
2913 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002914 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002915 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002916 "character argument not in range(0x110000)");
2917 return NULL;
2918 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002919 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002920 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002921 break;
2922 }
2923
2924 case 'i':
2925 case 'd':
2926 case 'u':
2927 case 'x':
2928 {
2929 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002930 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002931 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002932
2933 if (*f == 'u') {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002934 if (longflag) {
2935 len = sprintf(buffer, "%lu", va_arg(*vargs, unsigned long));
2936 }
2937 else if (longlongflag) {
2938 len = sprintf(buffer, "%llu", va_arg(*vargs, unsigned long long));
2939 }
2940 else if (size_tflag) {
2941 len = sprintf(buffer, "%zu", va_arg(*vargs, size_t));
2942 }
2943 else {
2944 len = sprintf(buffer, "%u", va_arg(*vargs, unsigned int));
2945 }
Victor Stinnere215d962012-10-06 23:03:36 +02002946 }
2947 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002948 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002949 }
2950 else {
Victor Stinnerd36cf5f2020-06-10 18:38:05 +02002951 if (longflag) {
2952 len = sprintf(buffer, "%li", va_arg(*vargs, long));
2953 }
2954 else if (longlongflag) {
2955 len = sprintf(buffer, "%lli", va_arg(*vargs, long long));
2956 }
2957 else if (size_tflag) {
2958 len = sprintf(buffer, "%zi", va_arg(*vargs, Py_ssize_t));
2959 }
2960 else {
2961 len = sprintf(buffer, "%i", va_arg(*vargs, int));
2962 }
Victor Stinnere215d962012-10-06 23:03:36 +02002963 }
2964 assert(len >= 0);
2965
Victor Stinnere215d962012-10-06 23:03:36 +02002966 if (precision < len)
2967 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002968
2969 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002970 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2971 return NULL;
2972
Victor Stinnere215d962012-10-06 23:03:36 +02002973 if (width > precision) {
2974 Py_UCS4 fillchar;
2975 fill = width - precision;
2976 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002977 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2978 return NULL;
2979 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002980 }
Victor Stinner15a11362012-10-06 23:48:20 +02002981 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002982 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002983 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2984 return NULL;
2985 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002986 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002987
Victor Stinner4a587072013-11-19 12:54:53 +01002988 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2989 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002990 break;
2991 }
2992
2993 case 'p':
2994 {
2995 char number[MAX_LONG_LONG_CHARS];
2996
2997 len = sprintf(number, "%p", va_arg(*vargs, void*));
2998 assert(len >= 0);
2999
3000 /* %p is ill-defined: ensure leading 0x. */
3001 if (number[1] == 'X')
3002 number[1] = 'x';
3003 else if (number[1] != 'x') {
3004 memmove(number + 2, number,
3005 strlen(number) + 1);
3006 number[0] = '0';
3007 number[1] = 'x';
3008 len += 2;
3009 }
3010
Victor Stinner4a587072013-11-19 12:54:53 +01003011 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003012 return NULL;
3013 break;
3014 }
3015
3016 case 's':
3017 {
3018 /* UTF-8 */
3019 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02003020 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003021 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003022 break;
3023 }
3024
3025 case 'U':
3026 {
3027 PyObject *obj = va_arg(*vargs, PyObject *);
3028 assert(obj && _PyUnicode_CHECK(obj));
3029
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003030 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003031 return NULL;
3032 break;
3033 }
3034
3035 case 'V':
3036 {
3037 PyObject *obj = va_arg(*vargs, PyObject *);
3038 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02003039 if (obj) {
3040 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003041 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003042 return NULL;
3043 }
3044 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003045 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02003046 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003047 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003048 }
3049 break;
3050 }
3051
3052 case 'S':
3053 {
3054 PyObject *obj = va_arg(*vargs, PyObject *);
3055 PyObject *str;
3056 assert(obj);
3057 str = PyObject_Str(obj);
3058 if (!str)
3059 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003060 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003061 Py_DECREF(str);
3062 return NULL;
3063 }
3064 Py_DECREF(str);
3065 break;
3066 }
3067
3068 case 'R':
3069 {
3070 PyObject *obj = va_arg(*vargs, PyObject *);
3071 PyObject *repr;
3072 assert(obj);
3073 repr = PyObject_Repr(obj);
3074 if (!repr)
3075 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003076 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003077 Py_DECREF(repr);
3078 return NULL;
3079 }
3080 Py_DECREF(repr);
3081 break;
3082 }
3083
3084 case 'A':
3085 {
3086 PyObject *obj = va_arg(*vargs, PyObject *);
3087 PyObject *ascii;
3088 assert(obj);
3089 ascii = PyObject_ASCII(obj);
3090 if (!ascii)
3091 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02003092 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02003093 Py_DECREF(ascii);
3094 return NULL;
3095 }
3096 Py_DECREF(ascii);
3097 break;
3098 }
3099
3100 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02003101 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003102 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02003103 break;
3104
3105 default:
3106 /* if we stumble upon an unknown formatting code, copy the rest
3107 of the format string to the output string. (we cannot just
3108 skip the code, since there's no way to know what's in the
3109 argument list) */
3110 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01003111 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02003112 return NULL;
3113 f = p+len;
3114 return f;
3115 }
3116
3117 f++;
Victor Stinner96865452011-03-01 23:44:09 +00003118 return f;
3119}
3120
Walter Dörwaldd2034312007-05-18 16:29:38 +00003121PyObject *
3122PyUnicode_FromFormatV(const char *format, va_list vargs)
3123{
Victor Stinnere215d962012-10-06 23:03:36 +02003124 va_list vargs2;
3125 const char *f;
3126 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003127
Victor Stinner8f674cc2013-04-17 23:02:17 +02003128 _PyUnicodeWriter_Init(&writer);
3129 writer.min_length = strlen(format) + 100;
3130 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003131
Benjamin Peterson0c212142016-09-20 20:39:33 -07003132 // Copy varags to be able to pass a reference to a subfunction.
3133 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003134
3135 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003136 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003137 f = unicode_fromformat_arg(&writer, f, &vargs2);
3138 if (f == NULL)
3139 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003140 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003141 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003142 const char *p;
3143 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003144
Victor Stinnere215d962012-10-06 23:03:36 +02003145 p = f;
3146 do
3147 {
3148 if ((unsigned char)*p > 127) {
3149 PyErr_Format(PyExc_ValueError,
3150 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3151 "string, got a non-ASCII byte: 0x%02x",
3152 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003153 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003154 }
3155 p++;
3156 }
3157 while (*p != '\0' && *p != '%');
3158 len = p - f;
3159
3160 if (*p == '\0')
3161 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003162
3163 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003164 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003165
3166 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003167 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003168 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003169 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003170 return _PyUnicodeWriter_Finish(&writer);
3171
3172 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003173 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003174 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003175 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003176}
3177
Walter Dörwaldd2034312007-05-18 16:29:38 +00003178PyObject *
3179PyUnicode_FromFormat(const char *format, ...)
3180{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003181 PyObject* ret;
3182 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003183
3184#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003185 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003186#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003187 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003188#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003189 ret = PyUnicode_FromFormatV(format, vargs);
3190 va_end(vargs);
3191 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003192}
3193
Serhiy Storchakac46db922018-10-23 22:58:24 +03003194static Py_ssize_t
3195unicode_get_widechar_size(PyObject *unicode)
3196{
3197 Py_ssize_t res;
3198
3199 assert(unicode != NULL);
3200 assert(_PyUnicode_CHECK(unicode));
3201
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003202#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchakac46db922018-10-23 22:58:24 +03003203 if (_PyUnicode_WSTR(unicode) != NULL) {
3204 return PyUnicode_WSTR_LENGTH(unicode);
3205 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003206#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003207 assert(PyUnicode_IS_READY(unicode));
3208
3209 res = _PyUnicode_LENGTH(unicode);
3210#if SIZEOF_WCHAR_T == 2
3211 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3212 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3213 const Py_UCS4 *end = s + res;
3214 for (; s < end; ++s) {
3215 if (*s > 0xFFFF) {
3216 ++res;
3217 }
3218 }
3219 }
3220#endif
3221 return res;
3222}
3223
3224static void
3225unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3226{
Serhiy Storchakac46db922018-10-23 22:58:24 +03003227 assert(unicode != NULL);
3228 assert(_PyUnicode_CHECK(unicode));
3229
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003230#if USE_UNICODE_WCHAR_CACHE
3231 const wchar_t *wstr = _PyUnicode_WSTR(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003232 if (wstr != NULL) {
3233 memcpy(w, wstr, size * sizeof(wchar_t));
3234 return;
3235 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03003236#else /* USE_UNICODE_WCHAR_CACHE */
3237 if (PyUnicode_KIND(unicode) == sizeof(wchar_t)) {
3238 memcpy(w, PyUnicode_DATA(unicode), size * sizeof(wchar_t));
3239 return;
3240 }
3241#endif /* USE_UNICODE_WCHAR_CACHE */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003242 assert(PyUnicode_IS_READY(unicode));
3243
3244 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3245 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3246 for (; size--; ++s, ++w) {
3247 *w = *s;
3248 }
3249 }
3250 else {
3251#if SIZEOF_WCHAR_T == 4
3252 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3253 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3254 for (; size--; ++s, ++w) {
3255 *w = *s;
3256 }
3257#else
3258 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3259 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3260 for (; size--; ++s, ++w) {
3261 Py_UCS4 ch = *s;
3262 if (ch > 0xFFFF) {
3263 assert(ch <= MAX_UNICODE);
3264 /* encode surrogate pair in this case */
3265 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3266 if (!size--)
3267 break;
3268 *w = Py_UNICODE_LOW_SURROGATE(ch);
3269 }
3270 else {
3271 *w = ch;
3272 }
3273 }
3274#endif
3275 }
3276}
3277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003278#ifdef HAVE_WCHAR_H
3279
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003280/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003281
Victor Stinnerd88d9832011-09-06 02:00:05 +02003282 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003283 character) required to convert the unicode object. Ignore size argument.
3284
Victor Stinnerd88d9832011-09-06 02:00:05 +02003285 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003286 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003287 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003288Py_ssize_t
3289PyUnicode_AsWideChar(PyObject *unicode,
3290 wchar_t *w,
3291 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003292{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003293 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003294
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003295 if (unicode == NULL) {
3296 PyErr_BadInternalCall();
3297 return -1;
3298 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003299 if (!PyUnicode_Check(unicode)) {
3300 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003301 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003302 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003303
3304 res = unicode_get_widechar_size(unicode);
3305 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003306 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003307 }
3308
3309 if (size > res) {
3310 size = res + 1;
3311 }
3312 else {
3313 res = size;
3314 }
3315 unicode_copy_as_widechar(unicode, w, size);
Jakub Kulík9032cf52021-04-30 15:21:42 +02003316
3317#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3318 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3319 non-Unicode locales and hence needs conversion first. */
3320 if (_Py_LocaleUsesNonUnicodeWchar()) {
3321 if (_Py_EncodeNonUnicodeWchar_InPlace(w, size) < 0) {
3322 return -1;
3323 }
3324 }
3325#endif
3326
Serhiy Storchakac46db922018-10-23 22:58:24 +03003327 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003328}
3329
Victor Stinner137c34c2010-09-29 10:25:54 +00003330wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003331PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003332 Py_ssize_t *size)
3333{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003334 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003335 Py_ssize_t buflen;
3336
3337 if (unicode == NULL) {
3338 PyErr_BadInternalCall();
3339 return NULL;
3340 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003341 if (!PyUnicode_Check(unicode)) {
3342 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003343 return NULL;
3344 }
3345
Serhiy Storchakac46db922018-10-23 22:58:24 +03003346 buflen = unicode_get_widechar_size(unicode);
3347 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003348 if (buffer == NULL) {
3349 PyErr_NoMemory();
3350 return NULL;
3351 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003352 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
Jakub Kulík9032cf52021-04-30 15:21:42 +02003353
3354#if HAVE_NON_UNICODE_WCHAR_T_REPRESENTATION
3355 /* Oracle Solaris uses non-Unicode internal wchar_t form for
3356 non-Unicode locales and hence needs conversion first. */
3357 if (_Py_LocaleUsesNonUnicodeWchar()) {
3358 if (_Py_EncodeNonUnicodeWchar_InPlace(buffer, (buflen + 1)) < 0) {
3359 return NULL;
3360 }
3361 }
3362#endif
3363
Serhiy Storchakac46db922018-10-23 22:58:24 +03003364 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003365 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003366 }
3367 else if (wcslen(buffer) != (size_t)buflen) {
Victor Stinner00d7abd2020-12-01 09:56:42 +01003368 PyMem_Free(buffer);
Serhiy Storchakac46db922018-10-23 22:58:24 +03003369 PyErr_SetString(PyExc_ValueError,
3370 "embedded null character");
3371 return NULL;
3372 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003373 return buffer;
3374}
3375
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003376#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003378int
3379_PyUnicode_WideCharString_Converter(PyObject *obj, void *ptr)
3380{
3381 wchar_t **p = (wchar_t **)ptr;
3382 if (obj == NULL) {
3383#if !USE_UNICODE_WCHAR_CACHE
3384 PyMem_Free(*p);
3385#endif /* USE_UNICODE_WCHAR_CACHE */
3386 *p = NULL;
3387 return 1;
3388 }
3389 if (PyUnicode_Check(obj)) {
3390#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003391 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3392 if (*p == NULL) {
3393 return 0;
3394 }
3395 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003396#else /* USE_UNICODE_WCHAR_CACHE */
3397 *p = PyUnicode_AsWideCharString(obj, NULL);
3398 if (*p == NULL) {
3399 return 0;
3400 }
3401 return Py_CLEANUP_SUPPORTED;
3402#endif /* USE_UNICODE_WCHAR_CACHE */
3403 }
3404 PyErr_Format(PyExc_TypeError,
3405 "argument must be str, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003406 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003407 return 0;
3408}
3409
3410int
3411_PyUnicode_WideCharString_Opt_Converter(PyObject *obj, void *ptr)
3412{
3413 wchar_t **p = (wchar_t **)ptr;
3414 if (obj == NULL) {
3415#if !USE_UNICODE_WCHAR_CACHE
3416 PyMem_Free(*p);
3417#endif /* USE_UNICODE_WCHAR_CACHE */
3418 *p = NULL;
3419 return 1;
3420 }
3421 if (obj == Py_None) {
3422 *p = NULL;
3423 return 1;
3424 }
3425 if (PyUnicode_Check(obj)) {
3426#if USE_UNICODE_WCHAR_CACHE
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003427 *p = (wchar_t *)_PyUnicode_AsUnicode(obj);
3428 if (*p == NULL) {
3429 return 0;
3430 }
3431 return 1;
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003432#else /* USE_UNICODE_WCHAR_CACHE */
3433 *p = PyUnicode_AsWideCharString(obj, NULL);
3434 if (*p == NULL) {
3435 return 0;
3436 }
3437 return Py_CLEANUP_SUPPORTED;
3438#endif /* USE_UNICODE_WCHAR_CACHE */
3439 }
3440 PyErr_Format(PyExc_TypeError,
3441 "argument must be str or None, not %.50s",
Victor Stinner8182cc22020-07-10 12:40:38 +02003442 Py_TYPE(obj)->tp_name);
Serhiy Storchaka349f76c2020-06-30 09:03:15 +03003443 return 0;
3444}
3445
Alexander Belopolsky40018472011-02-26 01:02:56 +00003446PyObject *
3447PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003448{
Victor Stinner8faf8212011-12-08 22:14:11 +01003449 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003450 PyErr_SetString(PyExc_ValueError,
3451 "chr() arg not in range(0x110000)");
3452 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003453 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003454
Victor Stinner985a82a2014-01-03 12:53:47 +01003455 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003456}
3457
Alexander Belopolsky40018472011-02-26 01:02:56 +00003458PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003459PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003461 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003462 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003463 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003464 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003465 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003466 Py_INCREF(obj);
3467 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003468 }
3469 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003470 /* For a Unicode subtype that's not a Unicode object,
3471 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003472 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003473 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003474 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003475 "Can't convert '%.100s' object to str implicitly",
3476 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003477 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003478}
3479
Alexander Belopolsky40018472011-02-26 01:02:56 +00003480PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003481PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003482 const char *encoding,
3483 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003484{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003485 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003486 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003487
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003489 PyErr_BadInternalCall();
3490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003492
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003493 /* Decoding bytes objects is the most common case and should be fast */
3494 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003495 if (PyBytes_GET_SIZE(obj) == 0) {
3496 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3497 return NULL;
3498 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003499 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003500 }
3501 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003502 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3503 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003504 }
3505
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003506 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003507 PyErr_SetString(PyExc_TypeError,
3508 "decoding str is not supported");
3509 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003510 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003511
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003512 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3513 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3514 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003515 "decoding to str: need a bytes-like object, %.80s found",
3516 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003517 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003518 }
Tim Petersced69f82003-09-16 20:30:58 +00003519
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003520 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003521 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003522 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3523 return NULL;
3524 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003525 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003527
Serhiy Storchaka05997252013-01-26 12:14:02 +02003528 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003529 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003530 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531}
3532
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase; every run of
   other characters between two copied characters becomes a single '_',
   while leading and trailing runs are dropped.

   Return 1 on success, or 0 on error (encoding is longer than
   lower_len-1). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    const char *in;
    char *out = lower;
    /* Last writable slot, reserved for the terminating NUL. */
    char *out_end = &lower[lower_len - 1];
    int pending_sep = 0;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Remember the separator; it is only emitted if another
               copied character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3581
Alexander Belopolsky40018472011-02-26 01:02:56 +00003582PyObject *
3583PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003584 Py_ssize_t size,
3585 const char *encoding,
3586 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003587{
3588 PyObject *buffer = NULL, *unicode;
3589 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003590 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3591
Victor Stinner22eb6892019-06-26 00:51:05 +02003592 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3593 return NULL;
3594 }
3595
Victor Stinnered076ed2019-06-26 01:49:32 +02003596 if (size == 0) {
3597 _Py_RETURN_UNICODE_EMPTY();
3598 }
3599
Victor Stinner942889a2016-09-05 15:40:10 -07003600 if (encoding == NULL) {
3601 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3602 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003603
Fred Drakee4315f52000-05-09 19:53:39 +00003604 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003605 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3606 char *lower = buflower;
3607
3608 /* Fast paths */
3609 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3610 lower += 3;
3611 if (*lower == '_') {
3612 /* Match "utf8" and "utf_8" */
3613 lower++;
3614 }
3615
3616 if (lower[0] == '8' && lower[1] == 0) {
3617 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3618 }
3619 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3620 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3621 }
3622 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3623 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3624 }
3625 }
3626 else {
3627 if (strcmp(lower, "ascii") == 0
3628 || strcmp(lower, "us_ascii") == 0) {
3629 return PyUnicode_DecodeASCII(s, size, errors);
3630 }
Steve Dowercc16be82016-09-08 10:35:16 -07003631 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003632 else if (strcmp(lower, "mbcs") == 0) {
3633 return PyUnicode_DecodeMBCS(s, size, errors);
3634 }
3635 #endif
3636 else if (strcmp(lower, "latin1") == 0
3637 || strcmp(lower, "latin_1") == 0
3638 || strcmp(lower, "iso_8859_1") == 0
3639 || strcmp(lower, "iso8859_1") == 0) {
3640 return PyUnicode_DecodeLatin1(s, size, errors);
3641 }
3642 }
Victor Stinner37296e82010-06-10 13:36:23 +00003643 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644
3645 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003646 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003647 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003648 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003649 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 if (buffer == NULL)
3651 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003652 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 if (unicode == NULL)
3654 goto onError;
3655 if (!PyUnicode_Check(unicode)) {
3656 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003657 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003658 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003659 encoding,
3660 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 Py_DECREF(unicode);
3662 goto onError;
3663 }
3664 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003665 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003666
Benjamin Peterson29060642009-01-31 22:14:21 +00003667 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003668 Py_XDECREF(buffer);
3669 return NULL;
3670}
3671
Alexander Belopolsky40018472011-02-26 01:02:56 +00003672PyObject *
3673PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003674 const char *encoding,
3675 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003676{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003677 if (!PyUnicode_Check(unicode)) {
3678 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003679 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003680 }
3681
Serhiy Storchaka00939072016-10-27 21:05:49 +03003682 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3683 "PyUnicode_AsDecodedObject() is deprecated; "
3684 "use PyCodec_Decode() to decode from str", 1) < 0)
3685 return NULL;
3686
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003687 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003688 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003689
3690 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003691 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003692}
3693
Alexander Belopolsky40018472011-02-26 01:02:56 +00003694PyObject *
3695PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003696 const char *encoding,
3697 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003698{
3699 PyObject *v;
3700
3701 if (!PyUnicode_Check(unicode)) {
3702 PyErr_BadArgument();
3703 goto onError;
3704 }
3705
Serhiy Storchaka00939072016-10-27 21:05:49 +03003706 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3707 "PyUnicode_AsDecodedUnicode() is deprecated; "
3708 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3709 return NULL;
3710
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003711 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003712 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003713
3714 /* Decode via the codec registry */
3715 v = PyCodec_Decode(unicode, encoding, errors);
3716 if (v == NULL)
3717 goto onError;
3718 if (!PyUnicode_Check(v)) {
3719 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003720 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003721 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003722 encoding,
3723 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003724 Py_DECREF(v);
3725 goto onError;
3726 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003727 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003728
Benjamin Peterson29060642009-01-31 22:14:21 +00003729 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003730 return NULL;
3731}
3732
Alexander Belopolsky40018472011-02-26 01:02:56 +00003733PyObject *
3734PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003735 Py_ssize_t size,
3736 const char *encoding,
3737 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738{
3739 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003740
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003741 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003742 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003743 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003744 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3745 Py_DECREF(unicode);
3746 return v;
3747}
3748
Alexander Belopolsky40018472011-02-26 01:02:56 +00003749PyObject *
3750PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003751 const char *encoding,
3752 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003753{
3754 PyObject *v;
3755
3756 if (!PyUnicode_Check(unicode)) {
3757 PyErr_BadArgument();
3758 goto onError;
3759 }
3760
Serhiy Storchaka00939072016-10-27 21:05:49 +03003761 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3762 "PyUnicode_AsEncodedObject() is deprecated; "
3763 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3764 "or PyCodec_Encode() for generic encoding", 1) < 0)
3765 return NULL;
3766
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003767 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003768 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003769
3770 /* Encode via the codec registry */
3771 v = PyCodec_Encode(unicode, encoding, errors);
3772 if (v == NULL)
3773 goto onError;
3774 return v;
3775
Benjamin Peterson29060642009-01-31 22:14:21 +00003776 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003777 return NULL;
3778}
3779
Victor Stinner1b579672011-12-17 05:47:23 +01003780
Victor Stinner2cba6b82018-01-10 22:46:15 +01003781static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003782unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003783 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003784{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003785 Py_ssize_t wlen;
3786 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3787 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003788 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003789 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003790
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003791 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003792 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003793 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003794 return NULL;
3795 }
3796
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003797 char *str;
3798 size_t error_pos;
3799 const char *reason;
3800 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003801 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003802 PyMem_Free(wstr);
3803
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003804 if (res != 0) {
3805 if (res == -2) {
3806 PyObject *exc;
3807 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3808 "locale", unicode,
3809 (Py_ssize_t)error_pos,
3810 (Py_ssize_t)(error_pos+1),
3811 reason);
3812 if (exc != NULL) {
3813 PyCodec_StrictErrors(exc);
3814 Py_DECREF(exc);
3815 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003816 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003817 else if (res == -3) {
3818 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3819 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003820 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003821 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003822 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003823 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003824 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003825
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003826 PyObject *bytes = PyBytes_FromString(str);
3827 PyMem_RawFree(str);
3828 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003829}
3830
Victor Stinnerad158722010-10-27 00:25:46 +00003831PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003832PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3833{
Victor Stinner709d23d2019-05-02 14:56:30 -04003834 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3835 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003836}
3837
3838PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003839PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003840{
Victor Stinner81a7be32020-04-14 15:14:01 +02003841 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02003842 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
3843 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003844 return unicode_encode_utf8(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003845 fs_codec->error_handler,
3846 fs_codec->errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003847 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003848#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02003849 else if (fs_codec->encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003850 return PyUnicode_AsEncodedString(unicode,
Victor Stinner3d17c042020-05-14 01:48:38 +02003851 fs_codec->encoding,
3852 fs_codec->errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003853 }
Victor Stinnerad158722010-10-27 00:25:46 +00003854#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003855 else {
3856 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3857 machinery is not ready and so cannot be used:
3858 use wcstombs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02003859 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
3860 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003861 assert(filesystem_errors != NULL);
3862 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3863 assert(errors != _Py_ERROR_UNKNOWN);
3864#ifdef _Py_FORCE_UTF8_FS_ENCODING
3865 return unicode_encode_utf8(unicode, errors, NULL);
3866#else
3867 return unicode_encode_locale(unicode, errors, 0);
3868#endif
3869 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003870}
3871
Alexander Belopolsky40018472011-02-26 01:02:56 +00003872PyObject *
3873PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003874 const char *encoding,
3875 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876{
3877 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003878 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003879
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880 if (!PyUnicode_Check(unicode)) {
3881 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003882 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883 }
Fred Drakee4315f52000-05-09 19:53:39 +00003884
Victor Stinner22eb6892019-06-26 00:51:05 +02003885 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3886 return NULL;
3887 }
3888
Victor Stinner942889a2016-09-05 15:40:10 -07003889 if (encoding == NULL) {
3890 return _PyUnicode_AsUTF8String(unicode, errors);
3891 }
3892
Fred Drakee4315f52000-05-09 19:53:39 +00003893 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003894 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3895 char *lower = buflower;
3896
3897 /* Fast paths */
3898 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3899 lower += 3;
3900 if (*lower == '_') {
3901 /* Match "utf8" and "utf_8" */
3902 lower++;
3903 }
3904
3905 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003906 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003907 }
3908 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3909 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3910 }
3911 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3912 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3913 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003914 }
Victor Stinner942889a2016-09-05 15:40:10 -07003915 else {
3916 if (strcmp(lower, "ascii") == 0
3917 || strcmp(lower, "us_ascii") == 0) {
3918 return _PyUnicode_AsASCIIString(unicode, errors);
3919 }
Steve Dowercc16be82016-09-08 10:35:16 -07003920#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003921 else if (strcmp(lower, "mbcs") == 0) {
3922 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3923 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003924#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003925 else if (strcmp(lower, "latin1") == 0 ||
3926 strcmp(lower, "latin_1") == 0 ||
3927 strcmp(lower, "iso_8859_1") == 0 ||
3928 strcmp(lower, "iso8859_1") == 0) {
3929 return _PyUnicode_AsLatin1String(unicode, errors);
3930 }
3931 }
Victor Stinner37296e82010-06-10 13:36:23 +00003932 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933
3934 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003935 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003936 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003937 return NULL;
3938
3939 /* The normal path */
3940 if (PyBytes_Check(v))
3941 return v;
3942
3943 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003944 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003945 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003946 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003947
3948 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003949 "encoder %s returned bytearray instead of bytes; "
3950 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003951 encoding);
3952 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003953 Py_DECREF(v);
3954 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003955 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003956
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003957 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3958 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003959 Py_DECREF(v);
3960 return b;
3961 }
3962
3963 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003964 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003965 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003966 encoding,
3967 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003968 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003969 return NULL;
3970}
3971
Alexander Belopolsky40018472011-02-26 01:02:56 +00003972PyObject *
3973PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003974 const char *encoding,
3975 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003976{
3977 PyObject *v;
3978
3979 if (!PyUnicode_Check(unicode)) {
3980 PyErr_BadArgument();
3981 goto onError;
3982 }
3983
Serhiy Storchaka00939072016-10-27 21:05:49 +03003984 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3985 "PyUnicode_AsEncodedUnicode() is deprecated; "
3986 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3987 return NULL;
3988
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003989 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003990 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003991
3992 /* Encode via the codec registry */
3993 v = PyCodec_Encode(unicode, encoding, errors);
3994 if (v == NULL)
3995 goto onError;
3996 if (!PyUnicode_Check(v)) {
3997 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003998 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003999 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02004000 encoding,
4001 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00004002 Py_DECREF(v);
4003 goto onError;
4004 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005 return v;
Tim Petersced69f82003-09-16 20:30:58 +00004006
Benjamin Peterson29060642009-01-31 22:14:21 +00004007 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 return NULL;
4009}
4010
Victor Stinner2cba6b82018-01-10 22:46:15 +01004011static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04004012unicode_decode_locale(const char *str, Py_ssize_t len,
4013 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004014{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004015 if (str[len] != '\0' || (size_t)len != strlen(str)) {
4016 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004017 return NULL;
4018 }
4019
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004020 wchar_t *wstr;
4021 size_t wlen;
4022 const char *reason;
4023 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04004024 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004025 if (res != 0) {
4026 if (res == -2) {
4027 PyObject *exc;
4028 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
4029 "locale", str, len,
4030 (Py_ssize_t)wlen,
4031 (Py_ssize_t)(wlen + 1),
4032 reason);
4033 if (exc != NULL) {
4034 PyCodec_StrictErrors(exc);
4035 Py_DECREF(exc);
4036 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01004037 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02004038 else if (res == -3) {
4039 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
4040 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01004041 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004042 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01004043 }
Victor Stinner2f197072011-12-17 07:08:30 +01004044 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01004045 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01004046
4047 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
4048 PyMem_RawFree(wstr);
4049 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004050}
4051
4052PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01004053PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
4054 const char *errors)
4055{
Victor Stinner709d23d2019-05-02 14:56:30 -04004056 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4057 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01004058}
4059
4060PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01004061PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004062{
4063 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04004064 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
4065 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01004066}
4067
4068
4069PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00004070PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004071 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00004072 return PyUnicode_DecodeFSDefaultAndSize(s, size);
4073}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004074
Christian Heimes5894ba72007-11-04 11:43:14 +00004075PyObject*
4076PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
4077{
Victor Stinner81a7be32020-04-14 15:14:01 +02004078 PyInterpreterState *interp = _PyInterpreterState_GET();
Victor Stinner3d17c042020-05-14 01:48:38 +02004079 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
4080 if (fs_codec->utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04004081 return unicode_decode_utf8(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004082 fs_codec->error_handler,
4083 fs_codec->errors,
Victor Stinner709d23d2019-05-02 14:56:30 -04004084 NULL);
4085 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004086#ifndef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +02004087 else if (fs_codec->encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08004088 return PyUnicode_Decode(s, size,
Victor Stinner3d17c042020-05-14 01:48:38 +02004089 fs_codec->encoding,
4090 fs_codec->errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004091 }
Victor Stinnerad158722010-10-27 00:25:46 +00004092#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004093 else {
4094 /* Before _PyUnicode_InitEncodings() is called, the Python codec
4095 machinery is not ready and so cannot be used:
4096 use mbstowcs() in this case. */
Victor Stinnerda7933e2020-04-13 03:04:28 +02004097 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
4098 const wchar_t *filesystem_errors = config->filesystem_errors;
Victor Stinnerbf305cc2020-02-05 17:39:57 +01004099 assert(filesystem_errors != NULL);
4100 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
4101 assert(errors != _Py_ERROR_UNKNOWN);
4102#ifdef _Py_FORCE_UTF8_FS_ENCODING
4103 return unicode_decode_utf8(s, size, errors, NULL, NULL);
4104#else
4105 return unicode_decode_locale(s, size, errors, 0);
4106#endif
4107 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00004108}
4109
Martin v. Löwis011e8422009-05-05 04:43:17 +00004110
4111int
4112PyUnicode_FSConverter(PyObject* arg, void* addr)
4113{
Brett Cannonec6ce872016-09-06 15:50:29 -07004114 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004115 PyObject *output = NULL;
4116 Py_ssize_t size;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004117 const char *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004118 if (arg == NULL) {
4119 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08004120 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004121 return 1;
4122 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004123 path = PyOS_FSPath(arg);
4124 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004125 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004126 }
Brett Cannonec6ce872016-09-06 15:50:29 -07004127 if (PyBytes_Check(path)) {
4128 output = path;
4129 }
4130 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
4131 output = PyUnicode_EncodeFSDefault(path);
4132 Py_DECREF(path);
4133 if (!output) {
4134 return 0;
4135 }
4136 assert(PyBytes_Check(output));
4137 }
4138
Victor Stinner0ea2a462010-04-30 00:22:08 +00004139 size = PyBytes_GET_SIZE(output);
4140 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02004141 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004142 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00004143 Py_DECREF(output);
4144 return 0;
4145 }
4146 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00004147 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00004148}
4149
4150
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004151int
4152PyUnicode_FSDecoder(PyObject* arg, void* addr)
4153{
Brett Cannona5711202016-09-06 19:36:01 -07004154 int is_buffer = 0;
4155 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004156 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004157 if (arg == NULL) {
4158 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03004159 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004160 return 1;
4161 }
Brett Cannona5711202016-09-06 19:36:01 -07004162
4163 is_buffer = PyObject_CheckBuffer(arg);
4164 if (!is_buffer) {
4165 path = PyOS_FSPath(arg);
4166 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03004167 return 0;
4168 }
Brett Cannona5711202016-09-06 19:36:01 -07004169 }
4170 else {
4171 path = arg;
4172 Py_INCREF(arg);
4173 }
4174
4175 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07004176 output = path;
4177 }
4178 else if (PyBytes_Check(path) || is_buffer) {
4179 PyObject *path_bytes = NULL;
4180
4181 if (!PyBytes_Check(path) &&
4182 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02004183 "path should be string, bytes, or os.PathLike, not %.200s",
4184 Py_TYPE(arg)->tp_name)) {
4185 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004186 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07004187 }
4188 path_bytes = PyBytes_FromObject(path);
4189 Py_DECREF(path);
4190 if (!path_bytes) {
4191 return 0;
4192 }
4193 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
4194 PyBytes_GET_SIZE(path_bytes));
4195 Py_DECREF(path_bytes);
4196 if (!output) {
4197 return 0;
4198 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004199 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004200 else {
4201 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02004202 "path should be string, bytes, or os.PathLike, not %.200s",
4203 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07004204 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03004205 return 0;
4206 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05004207 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02004208 Py_DECREF(output);
4209 return 0;
4210 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004211 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02004212 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03004213 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00004214 Py_DECREF(output);
4215 return 0;
4216 }
4217 *(PyObject**)addr = output;
4218 return Py_CLEANUP_SUPPORTED;
4219}
4220
4221
Inada Naoki02a4d572020-02-27 13:48:59 +09004222static int unicode_fill_utf8(PyObject *unicode);
4223
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004224const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004225PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004226{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004227 if (!PyUnicode_Check(unicode)) {
4228 PyErr_BadArgument();
4229 return NULL;
4230 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004231 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004232 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004233
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004234 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004235 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004236 return NULL;
4237 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004238 }
4239
4240 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004241 *psize = PyUnicode_UTF8_LENGTH(unicode);
4242 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004243}
4244
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004245const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004246PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004247{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004248 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4249}
4250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004251Py_UNICODE *
4252PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4253{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004254 if (!PyUnicode_Check(unicode)) {
4255 PyErr_BadArgument();
4256 return NULL;
4257 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004258 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4259 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004260 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004261 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004262 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004263
Serhiy Storchakac46db922018-10-23 22:58:24 +03004264 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4265 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4266 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004267 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004268 }
Victor Stinner32bd68c2020-12-01 10:37:39 +01004269 w = (wchar_t *) PyObject_Malloc(sizeof(wchar_t) * (wlen + 1));
Serhiy Storchakac46db922018-10-23 22:58:24 +03004270 if (w == NULL) {
4271 PyErr_NoMemory();
4272 return NULL;
4273 }
4274 unicode_copy_as_widechar(unicode, w, wlen + 1);
4275 _PyUnicode_WSTR(unicode) = w;
4276 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4277 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004278 }
4279 }
4280 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004281 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004282 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004283}
4284
Inada Naoki2c4928d2020-06-17 20:09:44 +09004285/* Deprecated APIs */
4286
4287_Py_COMP_DIAG_PUSH
4288_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4289
Alexander Belopolsky40018472011-02-26 01:02:56 +00004290Py_UNICODE *
4291PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004293 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294}
4295
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004296const Py_UNICODE *
4297_PyUnicode_AsUnicode(PyObject *unicode)
4298{
4299 Py_ssize_t size;
4300 const Py_UNICODE *wstr;
4301
4302 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4303 if (wstr && wcslen(wstr) != (size_t)size) {
4304 PyErr_SetString(PyExc_ValueError, "embedded null character");
4305 return NULL;
4306 }
4307 return wstr;
4308}
4309
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004310
Alexander Belopolsky40018472011-02-26 01:02:56 +00004311Py_ssize_t
4312PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004313{
4314 if (!PyUnicode_Check(unicode)) {
4315 PyErr_BadArgument();
4316 goto onError;
4317 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004318 if (_PyUnicode_WSTR(unicode) == NULL) {
4319 if (PyUnicode_AsUnicode(unicode) == NULL)
4320 goto onError;
4321 }
4322 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323
Benjamin Peterson29060642009-01-31 22:14:21 +00004324 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325 return -1;
4326}
4327
Inada Naoki2c4928d2020-06-17 20:09:44 +09004328_Py_COMP_DIAG_POP
4329
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004330Py_ssize_t
4331PyUnicode_GetLength(PyObject *unicode)
4332{
Victor Stinner07621332012-06-16 04:53:46 +02004333 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004334 PyErr_BadArgument();
4335 return -1;
4336 }
Victor Stinner07621332012-06-16 04:53:46 +02004337 if (PyUnicode_READY(unicode) == -1)
4338 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004339 return PyUnicode_GET_LENGTH(unicode);
4340}
4341
4342Py_UCS4
4343PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4344{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004345 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02004346 int kind;
4347
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004348 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004349 PyErr_BadArgument();
4350 return (Py_UCS4)-1;
4351 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004352 if (PyUnicode_READY(unicode) == -1) {
4353 return (Py_UCS4)-1;
4354 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004355 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004356 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004357 return (Py_UCS4)-1;
4358 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004359 data = PyUnicode_DATA(unicode);
4360 kind = PyUnicode_KIND(unicode);
4361 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004362}
4363
4364int
4365PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4366{
4367 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004368 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004369 return -1;
4370 }
Victor Stinner488fa492011-12-12 00:01:39 +01004371 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004372 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004373 PyErr_SetString(PyExc_IndexError, "string index out of range");
4374 return -1;
4375 }
Victor Stinner488fa492011-12-12 00:01:39 +01004376 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004377 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004378 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4379 PyErr_SetString(PyExc_ValueError, "character out of range");
4380 return -1;
4381 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004382 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4383 index, ch);
4384 return 0;
4385}
4386
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4392
Victor Stinner554f3f02010-06-16 23:33:54 +00004393/* create or adjust a UnicodeDecodeError */
4394static void
4395make_decode_exception(PyObject **exceptionObject,
4396 const char *encoding,
4397 const char *input, Py_ssize_t length,
4398 Py_ssize_t startpos, Py_ssize_t endpos,
4399 const char *reason)
4400{
4401 if (*exceptionObject == NULL) {
4402 *exceptionObject = PyUnicodeDecodeError_Create(
4403 encoding, input, length, startpos, endpos, reason);
4404 }
4405 else {
4406 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4407 goto onError;
4408 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4409 goto onError;
4410 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4411 goto onError;
4412 }
4413 return;
4414
4415onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004416 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004417}
4418
Steve Dowercc16be82016-09-08 10:35:16 -07004419#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004420static int
4421widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4422{
4423 if (newsize > *size) {
4424 wchar_t *newbuf = *buf;
4425 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4426 PyErr_NoMemory();
4427 return -1;
4428 }
4429 *buf = newbuf;
4430 }
4431 *size = newsize;
4432 return 0;
4433}
4434
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004435/* error handling callback helper:
4436 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004437 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004438 and adjust various state variables.
4439 return 0 on success, -1 on error
4440*/
4441
Alexander Belopolsky40018472011-02-26 01:02:56 +00004442static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004443unicode_decode_call_errorhandler_wchar(
4444 const char *errors, PyObject **errorHandler,
4445 const char *encoding, const char *reason,
4446 const char **input, const char **inend, Py_ssize_t *startinpos,
4447 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004448 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004449{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004450 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004451
4452 PyObject *restuple = NULL;
4453 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004454 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004455 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004456 Py_ssize_t requiredsize;
4457 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004458 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004459 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004460
4461 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004462 *errorHandler = PyCodec_LookupError(errors);
4463 if (*errorHandler == NULL)
4464 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 }
4466
Victor Stinner554f3f02010-06-16 23:33:54 +00004467 make_decode_exception(exceptionObject,
4468 encoding,
4469 *input, *inend - *input,
4470 *startinpos, *endinpos,
4471 reason);
4472 if (*exceptionObject == NULL)
4473 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004474
Petr Viktorinffd97532020-02-11 17:46:57 +01004475 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004477 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004478 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004479 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004480 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004481 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004482 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004483 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004484
4485 /* Copy back the bytes variables, which might have been modified by the
4486 callback */
4487 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4488 if (!inputobj)
4489 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004490 *input = PyBytes_AS_STRING(inputobj);
4491 insize = PyBytes_GET_SIZE(inputobj);
4492 *inend = *input + insize;
4493 /* we can DECREF safely, as the exception has another reference,
4494 so the object won't go away. */
4495 Py_DECREF(inputobj);
4496
4497 if (newpos<0)
4498 newpos = insize+newpos;
4499 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004500 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004501 goto onError;
4502 }
4503
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004504#if USE_UNICODE_WCHAR_CACHE
4505_Py_COMP_DIAG_PUSH
4506_Py_COMP_DIAG_IGNORE_DEPR_DECLS
4507 repwlen = PyUnicode_GetSize(repunicode);
4508 if (repwlen < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004509 goto onError;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004510_Py_COMP_DIAG_POP
4511#else /* USE_UNICODE_WCHAR_CACHE */
4512 repwlen = PyUnicode_AsWideChar(repunicode, NULL, 0);
4513 if (repwlen < 0)
4514 goto onError;
4515 repwlen--;
4516#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004517 /* need more space? (at least enough for what we
4518 have+the replacement+the rest of the string (starting
4519 at the new input position), so we won't have to check space
4520 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004521 requiredsize = *outpos;
4522 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4523 goto overflow;
4524 requiredsize += repwlen;
4525 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4526 goto overflow;
4527 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004528 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004529 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004530 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004531 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004532 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004533 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004534 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004535 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03004536 PyUnicode_AsWideChar(repunicode, *buf + *outpos, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004537 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004538 *endinpos = newpos;
4539 *inptr = *input + newpos;
4540
4541 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004542 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004543 return 0;
4544
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004545 overflow:
4546 PyErr_SetString(PyExc_OverflowError,
4547 "decoded result is too long for a Python string");
4548
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004549 onError:
4550 Py_XDECREF(restuple);
4551 return -1;
4552}
Steve Dowercc16be82016-09-08 10:35:16 -07004553#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004554
4555static int
4556unicode_decode_call_errorhandler_writer(
4557 const char *errors, PyObject **errorHandler,
4558 const char *encoding, const char *reason,
4559 const char **input, const char **inend, Py_ssize_t *startinpos,
4560 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4561 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4562{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004563 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004564
4565 PyObject *restuple = NULL;
4566 PyObject *repunicode = NULL;
4567 Py_ssize_t insize;
4568 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004569 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004570 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004571 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004572 int need_to_grow = 0;
4573 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004574
4575 if (*errorHandler == NULL) {
4576 *errorHandler = PyCodec_LookupError(errors);
4577 if (*errorHandler == NULL)
4578 goto onError;
4579 }
4580
4581 make_decode_exception(exceptionObject,
4582 encoding,
4583 *input, *inend - *input,
4584 *startinpos, *endinpos,
4585 reason);
4586 if (*exceptionObject == NULL)
4587 goto onError;
4588
Petr Viktorinffd97532020-02-11 17:46:57 +01004589 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004590 if (restuple == NULL)
4591 goto onError;
4592 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004593 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004594 goto onError;
4595 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004596 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004597 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004598
4599 /* Copy back the bytes variables, which might have been modified by the
4600 callback */
4601 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4602 if (!inputobj)
4603 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004604 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004605 *input = PyBytes_AS_STRING(inputobj);
4606 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004607 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004608 /* we can DECREF safely, as the exception has another reference,
4609 so the object won't go away. */
4610 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004611
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004612 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004613 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004614 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004615 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004616 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004617 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004618
Victor Stinner170ca6f2013-04-18 00:25:28 +02004619 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004620 if (replen > 1) {
4621 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004622 need_to_grow = 1;
4623 }
4624 new_inptr = *input + newpos;
4625 if (*inend - new_inptr > remain) {
4626 /* We don't know the decoding algorithm here so we make the worst
4627 assumption that one byte decodes to one unicode character.
4628 If unfortunately one byte could decode to more unicode characters,
4629 the decoder may write out-of-bound then. Is it possible for the
4630 algorithms using this function? */
4631 writer->min_length += *inend - new_inptr - remain;
4632 need_to_grow = 1;
4633 }
4634 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004635 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004636 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004637 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4638 goto onError;
4639 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004640 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004641 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004642
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004644 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004645
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004646 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004647 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004648 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004649
Benjamin Peterson29060642009-01-31 22:14:21 +00004650 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004651 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004652 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004653}
4654
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004655/* --- UTF-7 Codec -------------------------------------------------------- */
4656
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657/* See RFC2152 for details. We encode conservatively and decode liberally. */
4658
4659/* Three simple macros defining base-64. */
4660
/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized so that compound expressions
 * (e.g. "a || b") passed as flags bind before the following "&&". */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735
Alexander Belopolsky40018472011-02-26 01:02:56 +00004736PyObject *
4737PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004738 Py_ssize_t size,
4739 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004740{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004741 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4742}
4743
Antoine Pitrou244651a2009-05-04 18:56:13 +00004744/* The decoder. The only state we preserve is our read position,
4745 * i.e. how many characters we have consumed. So if we end in the
4746 * middle of a shift sequence we have to back off the read position
4747 * and the output to the beginning of the sequence, otherwise we lose
4748 * all the shift state (seen bits, number of bits seen, high
4749 * surrogate). */
4750
Alexander Belopolsky40018472011-02-26 01:02:56 +00004751PyObject *
4752PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004753 Py_ssize_t size,
4754 const char *errors,
4755 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004756{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004757 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004758 Py_ssize_t startinpos;
4759 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004760 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004761 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004762 const char *errmsg = "";
4763 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004764 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004765 unsigned int base64bits = 0;
4766 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004767 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004768 PyObject *errorHandler = NULL;
4769 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004770
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004771 if (size == 0) {
4772 if (consumed)
4773 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004774 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004775 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004776
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004777 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004778 _PyUnicodeWriter_Init(&writer);
4779 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004780
4781 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004782 e = s + size;
4783
4784 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004785 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004786 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004787 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004788
Antoine Pitrou244651a2009-05-04 18:56:13 +00004789 if (inShift) { /* in a base-64 section */
4790 if (IS_BASE64(ch)) { /* consume a base-64 character */
4791 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4792 base64bits += 6;
4793 s++;
4794 if (base64bits >= 16) {
4795 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004796 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004797 base64bits -= 16;
4798 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004799 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004800 if (surrogate) {
4801 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004802 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4803 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004804 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004805 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004806 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004807 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004808 }
4809 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004810 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004811 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004812 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004813 }
4814 }
Victor Stinner551ac952011-11-29 22:58:13 +01004815 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004816 /* first surrogate */
4817 surrogate = outCh;
4818 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004819 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004820 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004821 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004822 }
4823 }
4824 }
4825 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004826 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004827 if (base64bits > 0) { /* left-over bits */
4828 if (base64bits >= 6) {
4829 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004830 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004831 errmsg = "partial character in shift sequence";
4832 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004833 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004834 else {
4835 /* Some bits remain; they should be zero */
4836 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004837 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004838 errmsg = "non-zero padding bits in shift sequence";
4839 goto utf7Error;
4840 }
4841 }
4842 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004843 if (surrogate && DECODE_DIRECT(ch)) {
4844 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4845 goto onError;
4846 }
4847 surrogate = 0;
4848 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004849 /* '-' is absorbed; other terminating
4850 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004851 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004852 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004853 }
4854 }
4855 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004856 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004857 s++; /* consume '+' */
4858 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004859 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004860 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004861 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004862 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004863 else if (s < e && !IS_BASE64(*s)) {
4864 s++;
4865 errmsg = "ill-formed sequence";
4866 goto utf7Error;
4867 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004868 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004869 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004870 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004871 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004872 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004873 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004874 }
4875 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004876 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004877 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004878 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004879 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004880 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004881 else {
4882 startinpos = s-starts;
4883 s++;
4884 errmsg = "unexpected special character";
4885 goto utf7Error;
4886 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004887 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004888utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004889 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004890 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004891 errors, &errorHandler,
4892 "utf7", errmsg,
4893 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004894 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004895 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004896 }
4897
Antoine Pitrou244651a2009-05-04 18:56:13 +00004898 /* end of string */
4899
4900 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4901 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004902 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004903 if (surrogate ||
4904 (base64bits >= 6) ||
4905 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004906 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004907 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004908 errors, &errorHandler,
4909 "utf7", "unterminated shift sequence",
4910 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004911 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004912 goto onError;
4913 if (s < e)
4914 goto restart;
4915 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004916 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004917
4918 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004919 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004920 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004921 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004922 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004923 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004924 writer.kind, writer.data, shiftOutStart);
4925 Py_XDECREF(errorHandler);
4926 Py_XDECREF(exc);
4927 _PyUnicodeWriter_Dealloc(&writer);
4928 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004929 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004930 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004931 }
4932 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004933 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004934 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004935 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004936
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004937 Py_XDECREF(errorHandler);
4938 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004939 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004940
Benjamin Peterson29060642009-01-31 22:14:21 +00004941 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004942 Py_XDECREF(errorHandler);
4943 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004944 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004945 return NULL;
4946}
4947
4948
Alexander Belopolsky40018472011-02-26 01:02:56 +00004949PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004950_PyUnicode_EncodeUTF7(PyObject *str,
4951 int base64SetO,
4952 int base64WhiteSpace,
4953 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004954{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004955 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03004956 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004957 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004958 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004959 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004960 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004961 unsigned int base64bits = 0;
4962 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004963 char * out;
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03004964 const char * start;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004965
Benjamin Petersonbac79492012-01-14 13:34:47 -05004966 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004967 return NULL;
4968 kind = PyUnicode_KIND(str);
4969 data = PyUnicode_DATA(str);
4970 len = PyUnicode_GET_LENGTH(str);
4971
4972 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004973 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004974
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004975 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004976 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004977 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004978 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004979 if (v == NULL)
4980 return NULL;
4981
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004982 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004983 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004984 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004985
Antoine Pitrou244651a2009-05-04 18:56:13 +00004986 if (inShift) {
4987 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4988 /* shifting out */
4989 if (base64bits) { /* output remaining bits */
4990 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4991 base64buffer = 0;
4992 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004993 }
4994 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004995 /* Characters not in the BASE64 set implicitly unshift the sequence
4996 so no '-' is required, except if the character is itself a '-' */
4997 if (IS_BASE64(ch) || ch == '-') {
4998 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004999 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005000 *out++ = (char) ch;
5001 }
5002 else {
5003 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00005004 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005005 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005006 else { /* not in a shift sequence */
5007 if (ch == '+') {
5008 *out++ = '+';
5009 *out++ = '-';
5010 }
5011 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
5012 *out++ = (char) ch;
5013 }
5014 else {
5015 *out++ = '+';
5016 inShift = 1;
5017 goto encode_char;
5018 }
5019 }
5020 continue;
5021encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00005022 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01005023 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01005024
Antoine Pitrou244651a2009-05-04 18:56:13 +00005025 /* code first surrogate */
5026 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01005027 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00005028 while (base64bits >= 6) {
5029 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
5030 base64bits -= 6;
5031 }
5032 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01005033 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00005034 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005035 base64bits += 16;
5036 base64buffer = (base64buffer << 16) | ch;
5037 while (base64bits >= 6) {
5038 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
5039 base64bits -= 6;
5040 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00005041 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00005042 if (base64bits)
5043 *out++= TO_BASE64(base64buffer << (6-base64bits) );
5044 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005045 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005046 if (_PyBytes_Resize(&v, out - start) < 0)
5047 return NULL;
5048 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005049}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005050PyObject *
5051PyUnicode_EncodeUTF7(const Py_UNICODE *s,
5052 Py_ssize_t size,
5053 int base64SetO,
5054 int base64WhiteSpace,
5055 const char *errors)
5056{
5057 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005058 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005059 if (tmp == NULL)
5060 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01005061 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005062 base64WhiteSpace, errors);
5063 Py_DECREF(tmp);
5064 return result;
5065}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005066
Antoine Pitrou244651a2009-05-04 18:56:13 +00005067#undef IS_BASE64
5068#undef FROM_BASE64
5069#undef TO_BASE64
5070#undef DECODE_DIRECT
5071#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00005072
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073/* --- UTF-8 Codec -------------------------------------------------------- */
5074
Alexander Belopolsky40018472011-02-26 01:02:56 +00005075PyObject *
5076PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03005077 Py_ssize_t size,
5078 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079{
Walter Dörwald69652032004-09-07 20:24:22 +00005080 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
5081}
5082
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083#include "stringlib/asciilib.h"
5084#include "stringlib/codecs.h"
5085#include "stringlib/undef.h"
5086
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01005087#include "stringlib/ucs1lib.h"
5088#include "stringlib/codecs.h"
5089#include "stringlib/undef.h"
5090
5091#include "stringlib/ucs2lib.h"
5092#include "stringlib/codecs.h"
5093#include "stringlib/undef.h"
5094
5095#include "stringlib/ucs4lib.h"
5096#include "stringlib/codecs.h"
5097#include "stringlib/undef.h"
5098
Ma Lina0c603c2020-10-18 22:48:38 +08005099/* Mask to quickly check whether a C 'size_t' contains a
Antoine Pitrouab868312009-01-10 15:40:25 +00005100 non-ASCII, UTF8-encoded char. */
Ma Lina0c603c2020-10-18 22:48:38 +08005101#if (SIZEOF_SIZE_T == 8)
5102# define ASCII_CHAR_MASK 0x8080808080808080ULL
5103#elif (SIZEOF_SIZE_T == 4)
5104# define ASCII_CHAR_MASK 0x80808080U
Antoine Pitrouab868312009-01-10 15:40:25 +00005105#else
Ma Lina0c603c2020-10-18 22:48:38 +08005106# error C 'size_t' size should be either 4 or 8!
Antoine Pitrouab868312009-01-10 15:40:25 +00005107#endif
5108
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005109static Py_ssize_t
5110ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005111{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005112 const char *p = start;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005113
Ma Lina0c603c2020-10-18 22:48:38 +08005114#if SIZEOF_SIZE_T <= SIZEOF_VOID_P
Jessica Clarkedec07572021-03-31 11:12:39 +01005115 assert(_Py_IS_ALIGNED(dest, ALIGNOF_SIZE_T));
5116 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005117 /* Fast path, see in STRINGLIB(utf8_decode) for
5118 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005119 /* Help allocation */
5120 const char *_p = p;
5121 Py_UCS1 * q = dest;
Jessica Clarkedec07572021-03-31 11:12:39 +01005122 while (_p + SIZEOF_SIZE_T <= end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005123 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005124 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00005125 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005126 *((size_t *)q) = value;
5127 _p += SIZEOF_SIZE_T;
5128 q += SIZEOF_SIZE_T;
Benjamin Peterson14339b62009-01-31 16:36:08 +00005129 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005130 p = _p;
5131 while (p < end) {
5132 if ((unsigned char)*p & 0x80)
5133 break;
5134 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005136 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005138#endif
5139 while (p < end) {
5140 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
5141 for an explanation. */
Jessica Clarkedec07572021-03-31 11:12:39 +01005142 if (_Py_IS_ALIGNED(p, ALIGNOF_SIZE_T)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02005143 /* Help allocation */
5144 const char *_p = p;
Jessica Clarkedec07572021-03-31 11:12:39 +01005145 while (_p + SIZEOF_SIZE_T <= end) {
Ma Lina0c603c2020-10-18 22:48:38 +08005146 size_t value = *(const size_t *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147 if (value & ASCII_CHAR_MASK)
5148 break;
Ma Lina0c603c2020-10-18 22:48:38 +08005149 _p += SIZEOF_SIZE_T;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005150 }
5151 p = _p;
5152 if (_p == end)
5153 break;
5154 }
5155 if ((unsigned char)*p & 0x80)
5156 break;
5157 ++p;
5158 }
5159 memcpy(dest, start, p - start);
5160 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161}
Antoine Pitrouab868312009-01-10 15:40:25 +00005162
Victor Stinner709d23d2019-05-02 14:56:30 -04005163static PyObject *
5164unicode_decode_utf8(const char *s, Py_ssize_t size,
5165 _Py_error_handler error_handler, const char *errors,
5166 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01005167{
Victor Stinner785938e2011-12-11 20:09:03 +01005168 if (size == 0) {
5169 if (consumed)
5170 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005171 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01005172 }
5173
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005174 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
5175 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner2f9ada92020-06-24 02:22:21 +02005176 if (consumed) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005177 *consumed = 1;
Victor Stinner2f9ada92020-06-24 02:22:21 +02005178 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005179 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01005180 }
5181
Inada Naoki770847a2019-06-24 12:30:24 +09005182 const char *starts = s;
5183 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01005184
Inada Naoki770847a2019-06-24 12:30:24 +09005185 // fast path: try ASCII string.
5186 PyObject *u = PyUnicode_New(size, 127);
5187 if (u == NULL) {
5188 return NULL;
5189 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005190 s += ascii_decode(s, end, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09005191 if (s == end) {
5192 return u;
5193 }
5194
5195 // Use _PyUnicodeWriter after fast path is failed.
5196 _PyUnicodeWriter writer;
5197 _PyUnicodeWriter_InitWithBuffer(&writer, u);
5198 writer.pos = s - starts;
5199
5200 Py_ssize_t startinpos, endinpos;
5201 const char *errmsg = "";
5202 PyObject *error_handler_obj = NULL;
5203 PyObject *exc = NULL;
5204
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005205 while (s < end) {
5206 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005207 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02005208
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005209 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005210 if (PyUnicode_IS_ASCII(writer.buffer))
5211 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005212 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005213 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005214 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005215 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005216 } else {
5217 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005218 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005219 }
5220
5221 switch (ch) {
5222 case 0:
5223 if (s == end || consumed)
5224 goto End;
5225 errmsg = "unexpected end of data";
5226 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005227 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005228 break;
5229 case 1:
5230 errmsg = "invalid start byte";
5231 startinpos = s - starts;
5232 endinpos = startinpos + 1;
5233 break;
5234 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005235 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5236 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5237 {
5238 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005239 goto End;
5240 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005241 /* fall through */
5242 case 3:
5243 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005244 errmsg = "invalid continuation byte";
5245 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005246 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005247 break;
5248 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005249 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005250 goto onError;
5251 continue;
5252 }
5253
Victor Stinner1d65d912015-10-05 13:43:50 +02005254 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005255 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005256
5257 switch (error_handler) {
5258 case _Py_ERROR_IGNORE:
5259 s += (endinpos - startinpos);
5260 break;
5261
5262 case _Py_ERROR_REPLACE:
5263 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5264 goto onError;
5265 s += (endinpos - startinpos);
5266 break;
5267
5268 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005269 {
5270 Py_ssize_t i;
5271
Victor Stinner1d65d912015-10-05 13:43:50 +02005272 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5273 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005274 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005275 ch = (Py_UCS4)(unsigned char)(starts[i]);
5276 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5277 ch + 0xdc00);
5278 writer.pos++;
5279 }
5280 s += (endinpos - startinpos);
5281 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005282 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005283
5284 default:
5285 if (unicode_decode_call_errorhandler_writer(
5286 errors, &error_handler_obj,
5287 "utf-8", errmsg,
5288 &starts, &end, &startinpos, &endinpos, &exc, &s,
5289 &writer))
5290 goto onError;
5291 }
Victor Stinner785938e2011-12-11 20:09:03 +01005292 }
5293
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005294End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005295 if (consumed)
5296 *consumed = s - starts;
5297
Victor Stinner1d65d912015-10-05 13:43:50 +02005298 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005299 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005300 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005301
5302onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005303 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005304 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005305 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005306 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005307}
5308
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005309
Victor Stinner709d23d2019-05-02 14:56:30 -04005310PyObject *
5311PyUnicode_DecodeUTF8Stateful(const char *s,
5312 Py_ssize_t size,
5313 const char *errors,
5314 Py_ssize_t *consumed)
5315{
5316 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5317}
5318
5319
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005320/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5321 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005322
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005323 On success, write a pointer to a newly allocated wide character string into
5324 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5325 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005326
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005327 On memory allocation failure, return -1.
5328
5329 On decoding error (if surrogateescape is zero), return -2. If wlen is
5330 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5331 is not NULL, write the decoding error message into *reason. */
5332int
5333_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005334 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005335{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005336 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005337 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005338 wchar_t *unicode;
5339 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005340
Victor Stinner3d4226a2018-08-29 22:21:32 +02005341 int surrogateescape = 0;
5342 int surrogatepass = 0;
5343 switch (errors)
5344 {
5345 case _Py_ERROR_STRICT:
5346 break;
5347 case _Py_ERROR_SURROGATEESCAPE:
5348 surrogateescape = 1;
5349 break;
5350 case _Py_ERROR_SURROGATEPASS:
5351 surrogatepass = 1;
5352 break;
5353 default:
5354 return -3;
5355 }
5356
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005357 /* Note: size will always be longer than the resulting Unicode
5358 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005359 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005360 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005361 }
5362
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005363 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005364 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005365 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005366 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005367
5368 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005369 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005370 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005371 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005372 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005373#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005374 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005375#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005376 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005377#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005378 if (ch > 0xFF) {
5379#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005380 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005381#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005382 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005383 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005384 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5385 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5386#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005387 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005388 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005389 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005390 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005391 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005392
5393 if (surrogateescape) {
5394 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5395 }
5396 else {
5397 /* Is it a valid three-byte code? */
5398 if (surrogatepass
5399 && (e - s) >= 3
5400 && (s[0] & 0xf0) == 0xe0
5401 && (s[1] & 0xc0) == 0x80
5402 && (s[2] & 0xc0) == 0x80)
5403 {
5404 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5405 s += 3;
5406 unicode[outpos++] = ch;
5407 }
5408 else {
5409 PyMem_RawFree(unicode );
5410 if (reason != NULL) {
5411 switch (ch) {
5412 case 0:
5413 *reason = "unexpected end of data";
5414 break;
5415 case 1:
5416 *reason = "invalid start byte";
5417 break;
5418 /* 2, 3, 4 */
5419 default:
5420 *reason = "invalid continuation byte";
5421 break;
5422 }
5423 }
5424 if (wlen != NULL) {
5425 *wlen = s - orig_s;
5426 }
5427 return -2;
5428 }
5429 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005430 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005431 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005432 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005433 if (wlen) {
5434 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005435 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005436 *wstr = unicode;
5437 return 0;
5438}
5439
Victor Stinner5f9cf232019-03-19 01:46:25 +01005440
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005441wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005442_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5443 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005444{
5445 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005446 int res = _Py_DecodeUTF8Ex(arg, arglen,
5447 &wstr, wlen,
5448 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005449 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005450 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5451 assert(res != -3);
5452 if (wlen) {
5453 *wlen = (size_t)res;
5454 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005455 return NULL;
5456 }
5457 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005458}
5459
Antoine Pitrouab868312009-01-10 15:40:25 +00005460
/* UTF-8 encoder. 'errors' must be _Py_ERROR_STRICT,
   _Py_ERROR_SURROGATEESCAPE or _Py_ERROR_SURROGATEPASS.

   On success, return 0 and write the newly allocated character string into
   *str. The string is allocated with PyMem_RawMalloc() if raw_malloc is
   non-zero (use PyMem_RawFree() to free the memory), and with
   PyMem_Malloc() otherwise (use PyMem_Free() to free the memory).

   On encoding failure, return -2 and write the position of the invalid
   surrogate character into *error_pos (if error_pos is set) and the encoding
   error message into *reason (if reason is set).

   On memory allocation failure, return -1.

   If 'errors' is any other error handler, return -3. */
5471int
5472_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005473 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005474{
5475 const Py_ssize_t max_char_size = 4;
5476 Py_ssize_t len = wcslen(text);
5477
5478 assert(len >= 0);
5479
Victor Stinner3d4226a2018-08-29 22:21:32 +02005480 int surrogateescape = 0;
5481 int surrogatepass = 0;
5482 switch (errors)
5483 {
5484 case _Py_ERROR_STRICT:
5485 break;
5486 case _Py_ERROR_SURROGATEESCAPE:
5487 surrogateescape = 1;
5488 break;
5489 case _Py_ERROR_SURROGATEPASS:
5490 surrogatepass = 1;
5491 break;
5492 default:
5493 return -3;
5494 }
5495
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005496 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5497 return -1;
5498 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005499 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005500 if (raw_malloc) {
5501 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005502 }
5503 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005504 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005505 }
5506 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005507 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005508 }
5509
5510 char *p = bytes;
5511 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005512 for (i = 0; i < len; ) {
5513 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005514 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005515 i++;
5516#if Py_UNICODE_SIZE == 2
5517 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5518 && i < len
5519 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5520 {
5521 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5522 i++;
5523 }
5524#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005525
5526 if (ch < 0x80) {
5527 /* Encode ASCII */
5528 *p++ = (char) ch;
5529
5530 }
5531 else if (ch < 0x0800) {
5532 /* Encode Latin-1 */
5533 *p++ = (char)(0xc0 | (ch >> 6));
5534 *p++ = (char)(0x80 | (ch & 0x3f));
5535 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005536 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005537 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005538 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005539 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005540 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005541 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005542 if (reason != NULL) {
5543 *reason = "encoding error";
5544 }
5545 if (raw_malloc) {
5546 PyMem_RawFree(bytes);
5547 }
5548 else {
5549 PyMem_Free(bytes);
5550 }
5551 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005552 }
5553 *p++ = (char)(ch & 0xff);
5554 }
5555 else if (ch < 0x10000) {
5556 *p++ = (char)(0xe0 | (ch >> 12));
5557 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5558 *p++ = (char)(0x80 | (ch & 0x3f));
5559 }
5560 else { /* ch >= 0x10000 */
5561 assert(ch <= MAX_UNICODE);
5562 /* Encode UCS4 Unicode ordinals */
5563 *p++ = (char)(0xf0 | (ch >> 18));
5564 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5565 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5566 *p++ = (char)(0x80 | (ch & 0x3f));
5567 }
5568 }
5569 *p++ = '\0';
5570
5571 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005572 char *bytes2;
5573 if (raw_malloc) {
5574 bytes2 = PyMem_RawRealloc(bytes, final_size);
5575 }
5576 else {
5577 bytes2 = PyMem_Realloc(bytes, final_size);
5578 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005579 if (bytes2 == NULL) {
5580 if (error_pos != NULL) {
5581 *error_pos = (size_t)-1;
5582 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005583 if (raw_malloc) {
5584 PyMem_RawFree(bytes);
5585 }
5586 else {
5587 PyMem_Free(bytes);
5588 }
5589 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005590 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005591 *str = bytes2;
5592 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005593}
5594
5595
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005596/* Primary internal function which creates utf8 encoded bytes objects.
5597
5598 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005599 and allocate exactly as much space needed at the end. Else allocate the
5600 maximum possible needed (4 result bytes per Unicode character), and return
5601 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005602*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005603static PyObject *
5604unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5605 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005606{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005607 if (!PyUnicode_Check(unicode)) {
5608 PyErr_BadArgument();
5609 return NULL;
5610 }
5611
5612 if (PyUnicode_READY(unicode) == -1)
5613 return NULL;
5614
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005615 if (PyUnicode_UTF8(unicode))
5616 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5617 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005618
Inada Naoki02a4d572020-02-27 13:48:59 +09005619 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005620 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005621 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5622
5623 _PyBytesWriter writer;
5624 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005625
Benjamin Petersonead6b532011-12-20 17:23:42 -06005626 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005627 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005628 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005629 case PyUnicode_1BYTE_KIND:
5630 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5631 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005632 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5633 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005634 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005635 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5636 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005637 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005638 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5639 break;
Tim Peters602f7402002-04-27 18:03:26 +00005640 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005641
5642 if (end == NULL) {
5643 _PyBytesWriter_Dealloc(&writer);
5644 return NULL;
5645 }
5646 return _PyBytesWriter_Finish(&writer, end);
5647}
5648
5649static int
5650unicode_fill_utf8(PyObject *unicode)
5651{
5652 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5653 assert(!PyUnicode_IS_ASCII(unicode));
5654
5655 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03005656 const void *data = PyUnicode_DATA(unicode);
Inada Naoki02a4d572020-02-27 13:48:59 +09005657 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5658
5659 _PyBytesWriter writer;
5660 char *end;
5661
5662 switch (kind) {
5663 default:
5664 Py_UNREACHABLE();
5665 case PyUnicode_1BYTE_KIND:
5666 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5667 _Py_ERROR_STRICT, NULL);
5668 break;
5669 case PyUnicode_2BYTE_KIND:
5670 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5671 _Py_ERROR_STRICT, NULL);
5672 break;
5673 case PyUnicode_4BYTE_KIND:
5674 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5675 _Py_ERROR_STRICT, NULL);
5676 break;
5677 }
5678 if (end == NULL) {
5679 _PyBytesWriter_Dealloc(&writer);
5680 return -1;
5681 }
5682
Serhiy Storchaka8f87eef2020-04-12 14:58:27 +03005683 const char *start = writer.use_small_buffer ? writer.small_buffer :
Inada Naoki02a4d572020-02-27 13:48:59 +09005684 PyBytes_AS_STRING(writer.buffer);
5685 Py_ssize_t len = end - start;
5686
Victor Stinner32bd68c2020-12-01 10:37:39 +01005687 char *cache = PyObject_Malloc(len + 1);
Inada Naoki02a4d572020-02-27 13:48:59 +09005688 if (cache == NULL) {
5689 _PyBytesWriter_Dealloc(&writer);
5690 PyErr_NoMemory();
5691 return -1;
5692 }
5693 _PyUnicode_UTF8(unicode) = cache;
5694 _PyUnicode_UTF8_LENGTH(unicode) = len;
5695 memcpy(cache, start, len);
5696 cache[len] = '\0';
5697 _PyBytesWriter_Dealloc(&writer);
5698 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699}
5700
Alexander Belopolsky40018472011-02-26 01:02:56 +00005701PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005702_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5703{
5704 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5705}
5706
5707
5708PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005709PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5710 Py_ssize_t size,
5711 const char *errors)
5712{
5713 PyObject *v, *unicode;
5714
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005715 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005716 if (unicode == NULL)
5717 return NULL;
5718 v = _PyUnicode_AsUTF8String(unicode, errors);
5719 Py_DECREF(unicode);
5720 return v;
5721}
5722
5723PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005724PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005726 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727}
5728
/* --- UTF-32 Codec ------------------------------------------------------- */
5730
5731PyObject *
5732PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005733 Py_ssize_t size,
5734 const char *errors,
5735 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005736{
5737 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5738}
5739
5740PyObject *
5741PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005742 Py_ssize_t size,
5743 const char *errors,
5744 int *byteorder,
5745 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005746{
5747 const char *starts = s;
5748 Py_ssize_t startinpos;
5749 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005750 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005751 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005752 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005753 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005754 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005755 PyObject *errorHandler = NULL;
5756 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005757
Andy Lestere6be9b52020-02-11 20:28:35 -06005758 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005759 e = q + size;
5760
5761 if (byteorder)
5762 bo = *byteorder;
5763
5764 /* Check for BOM marks (U+FEFF) in the input and adjust current
5765 byte order setting accordingly. In native mode, the leading BOM
5766 mark is skipped, in all other modes, it is copied to the output
5767 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005768 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005769 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005770 if (bom == 0x0000FEFF) {
5771 bo = -1;
5772 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005773 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005774 else if (bom == 0xFFFE0000) {
5775 bo = 1;
5776 q += 4;
5777 }
5778 if (byteorder)
5779 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005780 }
5781
Victor Stinnere64322e2012-10-30 23:12:47 +01005782 if (q == e) {
5783 if (consumed)
5784 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005785 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005786 }
5787
Victor Stinnere64322e2012-10-30 23:12:47 +01005788#ifdef WORDS_BIGENDIAN
5789 le = bo < 0;
5790#else
5791 le = bo <= 0;
5792#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005793 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005794
Victor Stinner8f674cc2013-04-17 23:02:17 +02005795 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005796 writer.min_length = (e - q + 3) / 4;
5797 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005798 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005799
Victor Stinnere64322e2012-10-30 23:12:47 +01005800 while (1) {
5801 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005802 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005803
Victor Stinnere64322e2012-10-30 23:12:47 +01005804 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005805 enum PyUnicode_Kind kind = writer.kind;
5806 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005807 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005808 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005809 if (le) {
5810 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005811 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005812 if (ch > maxch)
5813 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005814 if (kind != PyUnicode_1BYTE_KIND &&
5815 Py_UNICODE_IS_SURROGATE(ch))
5816 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005817 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005818 q += 4;
5819 } while (q <= last);
5820 }
5821 else {
5822 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005823 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005824 if (ch > maxch)
5825 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005826 if (kind != PyUnicode_1BYTE_KIND &&
5827 Py_UNICODE_IS_SURROGATE(ch))
5828 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005829 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005830 q += 4;
5831 } while (q <= last);
5832 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005833 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005834 }
5835
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005836 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005837 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005838 startinpos = ((const char *)q) - starts;
5839 endinpos = startinpos + 4;
5840 }
5841 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005842 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005843 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005844 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005845 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005846 startinpos = ((const char *)q) - starts;
5847 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005848 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005849 else {
5850 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005851 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005852 goto onError;
5853 q += 4;
5854 continue;
5855 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005856 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005857 startinpos = ((const char *)q) - starts;
5858 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005859 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005860
5861 /* The remaining input chars are ignored if the callback
5862 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005863 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005864 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005865 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005866 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005867 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005868 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005869 }
5870
Walter Dörwald41980ca2007-08-16 21:55:45 +00005871 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005872 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005873
Walter Dörwald41980ca2007-08-16 21:55:45 +00005874 Py_XDECREF(errorHandler);
5875 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005876 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005877
Benjamin Peterson29060642009-01-31 22:14:21 +00005878 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005879 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005880 Py_XDECREF(errorHandler);
5881 Py_XDECREF(exc);
5882 return NULL;
5883}
5884
5885PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005886_PyUnicode_EncodeUTF32(PyObject *str,
5887 const char *errors,
5888 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005889{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005890 enum PyUnicode_Kind kind;
5891 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005892 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005893 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005894 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005895#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005896 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005897#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005898 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005899#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005900 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005901 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005902 PyObject *errorHandler = NULL;
5903 PyObject *exc = NULL;
5904 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005905
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005906 if (!PyUnicode_Check(str)) {
5907 PyErr_BadArgument();
5908 return NULL;
5909 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005910 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005911 return NULL;
5912 kind = PyUnicode_KIND(str);
5913 data = PyUnicode_DATA(str);
5914 len = PyUnicode_GET_LENGTH(str);
5915
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005916 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005917 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005918 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005919 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005920 if (v == NULL)
5921 return NULL;
5922
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005923 /* output buffer is 4-bytes aligned */
5924 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005925 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005926 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005927 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005928 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005929 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005930
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005931 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005932 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005933 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005934 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005935 else
5936 encoding = "utf-32";
5937
5938 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005939 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5940 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005941 }
5942
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005943 pos = 0;
5944 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005945 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005946
5947 if (kind == PyUnicode_2BYTE_KIND) {
5948 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5949 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005950 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005951 else {
5952 assert(kind == PyUnicode_4BYTE_KIND);
5953 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5954 &out, native_ordering);
5955 }
5956 if (pos == len)
5957 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005958
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005959 rep = unicode_encode_call_errorhandler(
5960 errors, &errorHandler,
5961 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005962 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005963 if (!rep)
5964 goto error;
5965
5966 if (PyBytes_Check(rep)) {
5967 repsize = PyBytes_GET_SIZE(rep);
5968 if (repsize & 3) {
5969 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005970 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005971 "surrogates not allowed");
5972 goto error;
5973 }
5974 moreunits = repsize / 4;
5975 }
5976 else {
5977 assert(PyUnicode_Check(rep));
5978 if (PyUnicode_READY(rep) < 0)
5979 goto error;
5980 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5981 if (!PyUnicode_IS_ASCII(rep)) {
5982 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005983 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005984 "surrogates not allowed");
5985 goto error;
5986 }
5987 }
5988
5989 /* four bytes are reserved for each surrogate */
5990 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005991 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005992 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005993 /* integer overflow */
5994 PyErr_NoMemory();
5995 goto error;
5996 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005997 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005998 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005999 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006000 }
6001
6002 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006003 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03006004 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006005 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006006 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03006007 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6008 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006009 }
6010
6011 Py_CLEAR(rep);
6012 }
6013
6014 /* Cut back to size actually needed. This is necessary for, for example,
6015 encoding of a string containing isolated surrogates and the 'ignore'
6016 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03006017 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006018 if (nsize != PyBytes_GET_SIZE(v))
6019 _PyBytes_Resize(&v, nsize);
6020 Py_XDECREF(errorHandler);
6021 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03006022 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006023 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006024 error:
6025 Py_XDECREF(rep);
6026 Py_XDECREF(errorHandler);
6027 Py_XDECREF(exc);
6028 Py_XDECREF(v);
6029 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00006030}
6031
Alexander Belopolsky40018472011-02-26 01:02:56 +00006032PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006033PyUnicode_EncodeUTF32(const Py_UNICODE *s,
6034 Py_ssize_t size,
6035 const char *errors,
6036 int byteorder)
6037{
6038 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006039 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006040 if (tmp == NULL)
6041 return NULL;
6042 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
6043 Py_DECREF(tmp);
6044 return result;
6045}
6046
6047PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006048PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00006049{
Victor Stinnerb960b342011-11-20 19:12:52 +01006050 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00006051}
6052
/* --- UTF-16 Codec ------------------------------------------------------- */
6054
Tim Peters772747b2001-08-09 22:21:55 +00006055PyObject *
6056PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006057 Py_ssize_t size,
6058 const char *errors,
6059 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060{
Walter Dörwald69652032004-09-07 20:24:22 +00006061 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
6062}
6063
6064PyObject *
6065PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00006066 Py_ssize_t size,
6067 const char *errors,
6068 int *byteorder,
6069 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00006070{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006071 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006072 Py_ssize_t startinpos;
6073 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006074 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006075 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00006076 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006077 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00006078 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006079 PyObject *errorHandler = NULL;
6080 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006081 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082
Andy Lestere6be9b52020-02-11 20:28:35 -06006083 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006084 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085
6086 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00006087 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006089 /* Check for BOM marks (U+FEFF) in the input and adjust current
6090 byte order setting accordingly. In native mode, the leading BOM
6091 mark is skipped, in all other modes, it is copied to the output
6092 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02006093 if (bo == 0 && size >= 2) {
6094 const Py_UCS4 bom = (q[1] << 8) | q[0];
6095 if (bom == 0xFEFF) {
6096 q += 2;
6097 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006098 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006099 else if (bom == 0xFFFE) {
6100 q += 2;
6101 bo = 1;
6102 }
6103 if (byteorder)
6104 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00006105 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106
Antoine Pitrou63065d72012-05-15 23:48:04 +02006107 if (q == e) {
6108 if (consumed)
6109 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02006110 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00006111 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02006112
Christian Heimes743e0cd2012-10-17 23:52:17 +02006113#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02006114 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006115 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00006116#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02006117 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006118 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00006119#endif
Tim Peters772747b2001-08-09 22:21:55 +00006120
Antoine Pitrou63065d72012-05-15 23:48:04 +02006121 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08006122 character count normally. Error handler will take care of
6123 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006124 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006125 writer.min_length = (e - q + 1) / 2;
6126 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006127 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006128
Antoine Pitrou63065d72012-05-15 23:48:04 +02006129 while (1) {
6130 Py_UCS4 ch = 0;
6131 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006132 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006133 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006134 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02006135 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006136 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006137 native_ordering);
6138 else
6139 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006140 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006141 native_ordering);
6142 } else if (kind == PyUnicode_2BYTE_KIND) {
6143 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006144 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006145 native_ordering);
6146 } else {
6147 assert(kind == PyUnicode_4BYTE_KIND);
6148 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006149 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02006150 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00006151 }
Antoine Pitrouab868312009-01-10 15:40:25 +00006152 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006153
Antoine Pitrou63065d72012-05-15 23:48:04 +02006154 switch (ch)
6155 {
6156 case 0:
6157 /* remaining byte at the end? (size should be even) */
6158 if (q == e || consumed)
6159 goto End;
6160 errmsg = "truncated data";
6161 startinpos = ((const char *)q) - starts;
6162 endinpos = ((const char *)e) - starts;
6163 break;
6164 /* The remaining input chars are ignored if the callback
6165 chooses to skip the input */
6166 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006167 q -= 2;
6168 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02006169 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006170 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02006171 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02006172 endinpos = ((const char *)e) - starts;
6173 break;
6174 case 2:
6175 errmsg = "illegal encoding";
6176 startinpos = ((const char *)q) - 2 - starts;
6177 endinpos = startinpos + 2;
6178 break;
6179 case 3:
6180 errmsg = "illegal UTF-16 surrogate";
6181 startinpos = ((const char *)q) - 4 - starts;
6182 endinpos = startinpos + 2;
6183 break;
6184 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02006185 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006186 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006187 continue;
6188 }
6189
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006190 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00006191 errors,
6192 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006193 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00006194 &starts,
6195 (const char **)&e,
6196 &startinpos,
6197 &endinpos,
6198 &exc,
6199 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006200 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00006201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202 }
6203
Antoine Pitrou63065d72012-05-15 23:48:04 +02006204End:
Walter Dörwald69652032004-09-07 20:24:22 +00006205 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00006206 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00006207
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006208 Py_XDECREF(errorHandler);
6209 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006210 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211
Benjamin Peterson29060642009-01-31 22:14:21 +00006212 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006213 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006214 Py_XDECREF(errorHandler);
6215 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006216 return NULL;
6217}
6218
Tim Peters772747b2001-08-09 22:21:55 +00006219PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006220_PyUnicode_EncodeUTF16(PyObject *str,
6221 const char *errors,
6222 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006223{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006224 enum PyUnicode_Kind kind;
6225 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006226 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006227 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006228 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006229 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02006230#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006231 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006232#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006233 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00006234#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006235 const char *encoding;
6236 Py_ssize_t nsize, pos;
6237 PyObject *errorHandler = NULL;
6238 PyObject *exc = NULL;
6239 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006240
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006241 if (!PyUnicode_Check(str)) {
6242 PyErr_BadArgument();
6243 return NULL;
6244 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006245 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006246 return NULL;
6247 kind = PyUnicode_KIND(str);
6248 data = PyUnicode_DATA(str);
6249 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006250
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006251 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006252 if (kind == PyUnicode_4BYTE_KIND) {
6253 const Py_UCS4 *in = (const Py_UCS4 *)data;
6254 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006255 while (in < end) {
6256 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006257 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006258 }
6259 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006260 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006261 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006263 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006264 nsize = len + pairs + (byteorder == 0);
6265 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006266 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006268 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006270 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006271 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006272 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006273 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006274 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006275 }
6276 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006277 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006278 }
Tim Peters772747b2001-08-09 22:21:55 +00006279
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006280 if (kind == PyUnicode_1BYTE_KIND) {
6281 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6282 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006283 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006284
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006285 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006286 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006287 }
6288 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006289 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006290 }
6291 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006292 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006293 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006294
6295 pos = 0;
6296 while (pos < len) {
6297 Py_ssize_t repsize, moreunits;
6298
6299 if (kind == PyUnicode_2BYTE_KIND) {
6300 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6301 &out, native_ordering);
6302 }
6303 else {
6304 assert(kind == PyUnicode_4BYTE_KIND);
6305 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6306 &out, native_ordering);
6307 }
6308 if (pos == len)
6309 break;
6310
6311 rep = unicode_encode_call_errorhandler(
6312 errors, &errorHandler,
6313 encoding, "surrogates not allowed",
6314 str, &exc, pos, pos + 1, &pos);
6315 if (!rep)
6316 goto error;
6317
6318 if (PyBytes_Check(rep)) {
6319 repsize = PyBytes_GET_SIZE(rep);
6320 if (repsize & 1) {
6321 raise_encode_exception(&exc, encoding,
6322 str, pos - 1, pos,
6323 "surrogates not allowed");
6324 goto error;
6325 }
6326 moreunits = repsize / 2;
6327 }
6328 else {
6329 assert(PyUnicode_Check(rep));
6330 if (PyUnicode_READY(rep) < 0)
6331 goto error;
6332 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6333 if (!PyUnicode_IS_ASCII(rep)) {
6334 raise_encode_exception(&exc, encoding,
6335 str, pos - 1, pos,
6336 "surrogates not allowed");
6337 goto error;
6338 }
6339 }
6340
6341 /* two bytes are reserved for each surrogate */
6342 if (moreunits > 1) {
6343 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006344 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006345 /* integer overflow */
6346 PyErr_NoMemory();
6347 goto error;
6348 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006349 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006350 goto error;
6351 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6352 }
6353
6354 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006355 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006356 out += moreunits;
6357 } else /* rep is unicode */ {
6358 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6359 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6360 &out, native_ordering);
6361 }
6362
6363 Py_CLEAR(rep);
6364 }
6365
6366 /* Cut back to size actually needed. This is necessary for, for example,
6367 encoding of a string containing isolated surrogates and the 'ignore' handler
6368 is used. */
6369 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6370 if (nsize != PyBytes_GET_SIZE(v))
6371 _PyBytes_Resize(&v, nsize);
6372 Py_XDECREF(errorHandler);
6373 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006374 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006375 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006376 error:
6377 Py_XDECREF(rep);
6378 Py_XDECREF(errorHandler);
6379 Py_XDECREF(exc);
6380 Py_XDECREF(v);
6381 return NULL;
6382#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383}
6384
Alexander Belopolsky40018472011-02-26 01:02:56 +00006385PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006386PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6387 Py_ssize_t size,
6388 const char *errors,
6389 int byteorder)
6390{
6391 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006392 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006393 if (tmp == NULL)
6394 return NULL;
6395 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6396 Py_DECREF(tmp);
6397 return result;
6398}
6399
6400PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006401PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006403 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404}
6405
/* --- Unicode Escape Codec ----------------------------------------------- */
6407
Victor Stinner47e1afd2020-10-26 16:43:47 +01006408static _PyUnicode_Name_CAPI *ucnhash_capi = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006409
Alexander Belopolsky40018472011-02-26 01:02:56 +00006410PyObject *
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006411_PyUnicode_DecodeUnicodeEscapeInternal(const char *s,
Eric V. Smith42454af2016-10-31 09:22:08 -04006412 Py_ssize_t size,
6413 const char *errors,
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006414 Py_ssize_t *consumed,
Eric V. Smith42454af2016-10-31 09:22:08 -04006415 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006416{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006417 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006418 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006420 PyObject *errorHandler = NULL;
6421 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006422
Eric V. Smith42454af2016-10-31 09:22:08 -04006423 // so we can remember if we've seen an invalid escape char or not
6424 *first_invalid_escape = NULL;
6425
Victor Stinner62ec3312016-09-06 17:04:34 -07006426 if (size == 0) {
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006427 if (consumed) {
6428 *consumed = 0;
6429 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006430 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006431 }
6432 /* Escaped strings will always be longer than the resulting
6433 Unicode string, so we start with size here and then reduce the
6434 length after conversion to the true value.
6435 (but if the error callback returns a long replacement string
6436 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006437 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006438 writer.min_length = size;
6439 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6440 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006441 }
6442
Guido van Rossumd57fd912000-03-10 22:53:23 +00006443 end = s + size;
6444 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006445 unsigned char c = (unsigned char) *s++;
6446 Py_UCS4 ch;
6447 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006448 const char *message;
6449
6450#define WRITE_ASCII_CHAR(ch) \
6451 do { \
6452 assert(ch <= 127); \
6453 assert(writer.pos < writer.size); \
6454 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6455 } while(0)
6456
6457#define WRITE_CHAR(ch) \
6458 do { \
6459 if (ch <= writer.maxchar) { \
6460 assert(writer.pos < writer.size); \
6461 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6462 } \
6463 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6464 goto onError; \
6465 } \
6466 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006467
6468 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006469 if (c != '\\') {
6470 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 continue;
6472 }
6473
Serhiy Storchaka4641afe2021-10-14 21:23:39 +03006474 Py_ssize_t startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006475 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006476 if (s >= end) {
6477 message = "\\ at end of string";
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006478 goto incomplete;
Victor Stinner62ec3312016-09-06 17:04:34 -07006479 }
6480 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006481
Victor Stinner62ec3312016-09-06 17:04:34 -07006482 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006483 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006484
Benjamin Peterson29060642009-01-31 22:14:21 +00006485 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006486 case '\n': continue;
6487 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6488 case '\'': WRITE_ASCII_CHAR('\''); continue;
6489 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6490 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006491 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006492 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6493 case 't': WRITE_ASCII_CHAR('\t'); continue;
6494 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6495 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006496 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006497 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006498 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500
Benjamin Peterson29060642009-01-31 22:14:21 +00006501 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502 case '0': case '1': case '2': case '3':
6503 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006504 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006505 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006506 ch = (ch<<3) + *s++ - '0';
6507 if (s < end && '0' <= *s && *s <= '7') {
6508 ch = (ch<<3) + *s++ - '0';
6509 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006511 WRITE_CHAR(ch);
6512 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513
Benjamin Peterson29060642009-01-31 22:14:21 +00006514 /* hex escapes */
6515 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006516 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006517 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006518 message = "truncated \\xXX escape";
6519 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520
Benjamin Peterson29060642009-01-31 22:14:21 +00006521 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006523 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006524 message = "truncated \\uXXXX escape";
6525 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526
Benjamin Peterson29060642009-01-31 22:14:21 +00006527 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006528 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006529 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006530 message = "truncated \\UXXXXXXXX escape";
6531 hexescape:
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006532 for (ch = 0; count; ++s, --count) {
6533 if (s >= end) {
6534 goto incomplete;
6535 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006536 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006537 ch <<= 4;
6538 if (c >= '0' && c <= '9') {
6539 ch += c - '0';
6540 }
6541 else if (c >= 'a' && c <= 'f') {
6542 ch += c - ('a' - 10);
6543 }
6544 else if (c >= 'A' && c <= 'F') {
6545 ch += c - ('A' - 10);
6546 }
6547 else {
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006548 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006549 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006550 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006551
6552 /* when we get here, ch is a 32-bit unicode character */
6553 if (ch > MAX_UNICODE) {
6554 message = "illegal Unicode character";
6555 goto error;
6556 }
6557
6558 WRITE_CHAR(ch);
6559 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006560
Benjamin Peterson29060642009-01-31 22:14:21 +00006561 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006562 case 'N':
Victor Stinner47e1afd2020-10-26 16:43:47 +01006563 if (ucnhash_capi == NULL) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006564 /* load the unicode data module */
Victor Stinner47e1afd2020-10-26 16:43:47 +01006565 ucnhash_capi = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006566 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner47e1afd2020-10-26 16:43:47 +01006567 if (ucnhash_capi == NULL) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006568 PyErr_SetString(
6569 PyExc_UnicodeError,
6570 "\\N escapes not supported (can't load unicodedata module)"
6571 );
6572 goto onError;
6573 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006574 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006575
6576 message = "malformed \\N character escape";
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006577 if (s >= end) {
6578 goto incomplete;
6579 }
6580 if (*s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006581 const char *start = ++s;
6582 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006583 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006584 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006585 s++;
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006586 if (s >= end) {
6587 goto incomplete;
6588 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006589 namelen = s - start;
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006590 if (namelen) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006591 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006592 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006593 ch = 0xffffffff; /* in case 'getcode' messes up */
6594 if (namelen <= INT_MAX &&
Victor Stinner920cb642020-10-26 19:19:36 +01006595 ucnhash_capi->getcode(start, (int)namelen,
Victor Stinner62ec3312016-09-06 17:04:34 -07006596 &ch, 0)) {
6597 assert(ch <= MAX_UNICODE);
6598 WRITE_CHAR(ch);
6599 continue;
6600 }
6601 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006602 }
6603 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006604 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006605
6606 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006607 if (*first_invalid_escape == NULL) {
6608 *first_invalid_escape = s-1; /* Back up one char, since we've
6609 already incremented s. */
6610 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006611 WRITE_ASCII_CHAR('\\');
6612 WRITE_CHAR(c);
6613 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006615
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006616 incomplete:
6617 if (consumed) {
6618 *consumed = startinpos;
6619 break;
6620 }
Serhiy Storchaka4641afe2021-10-14 21:23:39 +03006621 error:;
6622 Py_ssize_t endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006623 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006624 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006625 errors, &errorHandler,
6626 "unicodeescape", message,
6627 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006628 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006629 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006630 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006631 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006632
6633#undef WRITE_ASCII_CHAR
6634#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006636
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006637 Py_XDECREF(errorHandler);
6638 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006639 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006640
Benjamin Peterson29060642009-01-31 22:14:21 +00006641 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006642 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006643 Py_XDECREF(errorHandler);
6644 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 return NULL;
6646}
6647
Eric V. Smith42454af2016-10-31 09:22:08 -04006648PyObject *
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006649_PyUnicode_DecodeUnicodeEscapeStateful(const char *s,
Eric V. Smith42454af2016-10-31 09:22:08 -04006650 Py_ssize_t size,
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006651 const char *errors,
6652 Py_ssize_t *consumed)
Eric V. Smith42454af2016-10-31 09:22:08 -04006653{
6654 const char *first_invalid_escape;
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006655 PyObject *result = _PyUnicode_DecodeUnicodeEscapeInternal(s, size, errors,
6656 consumed,
Eric V. Smith42454af2016-10-31 09:22:08 -04006657 &first_invalid_escape);
6658 if (result == NULL)
6659 return NULL;
6660 if (first_invalid_escape != NULL) {
6661 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6662 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006663 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006664 Py_DECREF(result);
6665 return NULL;
6666 }
6667 }
6668 return result;
6669}
6670
Miss Islington (bot)0bff4cc2021-10-14 10:02:20 -07006671PyObject *
6672PyUnicode_DecodeUnicodeEscape(const char *s,
6673 Py_ssize_t size,
6674 const char *errors)
6675{
6676 return _PyUnicode_DecodeUnicodeEscapeStateful(s, size, errors, NULL);
6677}
6678
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006679/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680
Alexander Belopolsky40018472011-02-26 01:02:56 +00006681PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006682PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006684 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006685 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006687 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006688 const void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006689 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690
Ezio Melottie7f90372012-10-05 03:33:31 +03006691 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006692 escape.
6693
Ezio Melottie7f90372012-10-05 03:33:31 +03006694 For UCS1 strings it's '\xxx', 4 bytes per source character.
6695 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6696 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006697 */
6698
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006699 if (!PyUnicode_Check(unicode)) {
6700 PyErr_BadArgument();
6701 return NULL;
6702 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006703 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006704 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006705 }
Victor Stinner358af132015-10-12 22:36:57 +02006706
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006707 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006708 if (len == 0) {
6709 return PyBytes_FromStringAndSize(NULL, 0);
6710 }
6711
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006712 kind = PyUnicode_KIND(unicode);
6713 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006714 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6715 bytes, and 1 byte characters 4. */
6716 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006717 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006718 return PyErr_NoMemory();
6719 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006720 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006721 if (repr == NULL) {
6722 return NULL;
6723 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006724
Victor Stinner62ec3312016-09-06 17:04:34 -07006725 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006726 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006727 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006728
Victor Stinner62ec3312016-09-06 17:04:34 -07006729 /* U+0000-U+00ff range */
6730 if (ch < 0x100) {
6731 if (ch >= ' ' && ch < 127) {
6732 if (ch != '\\') {
6733 /* Copy printable US ASCII as-is */
6734 *p++ = (char) ch;
6735 }
6736 /* Escape backslashes */
6737 else {
6738 *p++ = '\\';
6739 *p++ = '\\';
6740 }
6741 }
Victor Stinner358af132015-10-12 22:36:57 +02006742
Victor Stinner62ec3312016-09-06 17:04:34 -07006743 /* Map special whitespace to '\t', \n', '\r' */
6744 else if (ch == '\t') {
6745 *p++ = '\\';
6746 *p++ = 't';
6747 }
6748 else if (ch == '\n') {
6749 *p++ = '\\';
6750 *p++ = 'n';
6751 }
6752 else if (ch == '\r') {
6753 *p++ = '\\';
6754 *p++ = 'r';
6755 }
6756
6757 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6758 else {
6759 *p++ = '\\';
6760 *p++ = 'x';
6761 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6762 *p++ = Py_hexdigits[ch & 0x000F];
6763 }
Tim Petersced69f82003-09-16 20:30:58 +00006764 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006765 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006766 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 *p++ = '\\';
6768 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006769 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6770 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6771 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6772 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006774 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6775 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006776
Victor Stinner62ec3312016-09-06 17:04:34 -07006777 /* Make sure that the first two digits are zero */
6778 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006779 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006780 *p++ = 'U';
6781 *p++ = '0';
6782 *p++ = '0';
6783 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6784 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6785 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6786 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6787 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6788 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006789 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791
Victor Stinner62ec3312016-09-06 17:04:34 -07006792 assert(p - PyBytes_AS_STRING(repr) > 0);
6793 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6794 return NULL;
6795 }
6796 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797}
6798
Alexander Belopolsky40018472011-02-26 01:02:56 +00006799PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006800PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6801 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006803 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006804 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006805 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006807 }
6808
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006809 result = PyUnicode_AsUnicodeEscapeString(tmp);
6810 Py_DECREF(tmp);
6811 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812}
6813
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6815
Alexander Belopolsky40018472011-02-26 01:02:56 +00006816PyObject *
Serhiy Storchaka4641afe2021-10-14 21:23:39 +03006817_PyUnicode_DecodeRawUnicodeEscapeStateful(const char *s,
6818 Py_ssize_t size,
6819 const char *errors,
6820 Py_ssize_t *consumed)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006822 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006823 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006824 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006825 PyObject *errorHandler = NULL;
6826 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006827
Victor Stinner62ec3312016-09-06 17:04:34 -07006828 if (size == 0) {
Serhiy Storchaka4641afe2021-10-14 21:23:39 +03006829 if (consumed) {
6830 *consumed = 0;
6831 }
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006832 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006833 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006834
Guido van Rossumd57fd912000-03-10 22:53:23 +00006835 /* Escaped strings will always be longer than the resulting
6836 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006837 length after conversion to the true value. (But decoding error
6838 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006839 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006840 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006841 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6842 goto onError;
6843 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006844
Guido van Rossumd57fd912000-03-10 22:53:23 +00006845 end = s + size;
6846 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006847 unsigned char c = (unsigned char) *s++;
6848 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006849 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006850 const char *message;
6851
6852#define WRITE_CHAR(ch) \
6853 do { \
6854 if (ch <= writer.maxchar) { \
6855 assert(writer.pos < writer.size); \
6856 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6857 } \
6858 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6859 goto onError; \
6860 } \
6861 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862
Benjamin Peterson29060642009-01-31 22:14:21 +00006863 /* Non-escape characters are interpreted as Unicode ordinals */
Serhiy Storchaka4641afe2021-10-14 21:23:39 +03006864 if (c != '\\' || (s >= end && !consumed)) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006865 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006866 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006867 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006868
Serhiy Storchaka4641afe2021-10-14 21:23:39 +03006869 Py_ssize_t startinpos = s - starts - 1;
6870 /* \ - Escapes */
6871 if (s >= end) {
6872 assert(consumed);
6873 // Set message to silent compiler warning.
6874 // Actually it is never used.
6875 message = "\\ at end of string";
6876 goto incomplete;
6877 }
6878
Victor Stinner62ec3312016-09-06 17:04:34 -07006879 c = (unsigned char) *s++;
6880 if (c == 'u') {
6881 count = 4;
6882 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006884 else if (c == 'U') {
6885 count = 8;
6886 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006887 }
6888 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006889 assert(writer.pos < writer.size);
6890 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6891 WRITE_CHAR(c);
6892 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006893 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006894
6895 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
Serhiy Storchaka4641afe2021-10-14 21:23:39 +03006896 for (ch = 0; count; ++s, --count) {
6897 if (s >= end) {
6898 goto incomplete;
6899 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006900 c = (unsigned char)*s;
6901 ch <<= 4;
6902 if (c >= '0' && c <= '9') {
6903 ch += c - '0';
6904 }
6905 else if (c >= 'a' && c <= 'f') {
6906 ch += c - ('a' - 10);
6907 }
6908 else if (c >= 'A' && c <= 'F') {
6909 ch += c - ('A' - 10);
6910 }
6911 else {
Serhiy Storchaka4641afe2021-10-14 21:23:39 +03006912 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006913 }
6914 }
Serhiy Storchaka4641afe2021-10-14 21:23:39 +03006915 if (ch > MAX_UNICODE) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006916 message = "\\Uxxxxxxxx out of range";
Serhiy Storchaka4641afe2021-10-14 21:23:39 +03006917 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006918 }
Serhiy Storchaka4641afe2021-10-14 21:23:39 +03006919 WRITE_CHAR(ch);
6920 continue;
Victor Stinner62ec3312016-09-06 17:04:34 -07006921
Serhiy Storchaka4641afe2021-10-14 21:23:39 +03006922 incomplete:
6923 if (consumed) {
6924 *consumed = startinpos;
6925 break;
6926 }
6927 error:;
6928 Py_ssize_t endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006929 writer.min_length = end - s + writer.pos;
6930 if (unicode_decode_call_errorhandler_writer(
6931 errors, &errorHandler,
6932 "rawunicodeescape", message,
6933 &starts, &end, &startinpos, &endinpos, &exc, &s,
6934 &writer)) {
6935 goto onError;
6936 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006937 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006938
6939#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006940 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006941 Py_XDECREF(errorHandler);
6942 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006943 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006944
Benjamin Peterson29060642009-01-31 22:14:21 +00006945 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006946 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006947 Py_XDECREF(errorHandler);
6948 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006949 return NULL;
Serhiy Storchaka4641afe2021-10-14 21:23:39 +03006950}
Victor Stinner62ec3312016-09-06 17:04:34 -07006951
Serhiy Storchaka4641afe2021-10-14 21:23:39 +03006952PyObject *
6953PyUnicode_DecodeRawUnicodeEscape(const char *s,
6954 Py_ssize_t size,
6955 const char *errors)
6956{
6957 return _PyUnicode_DecodeRawUnicodeEscapeStateful(s, size, errors, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958}
6959
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006960
Alexander Belopolsky40018472011-02-26 01:02:56 +00006961PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006962PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006963{
Victor Stinner62ec3312016-09-06 17:04:34 -07006964 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006965 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006966 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006967 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03006968 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006969 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006970
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006971 if (!PyUnicode_Check(unicode)) {
6972 PyErr_BadArgument();
6973 return NULL;
6974 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006975 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006976 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006977 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006978 kind = PyUnicode_KIND(unicode);
6979 data = PyUnicode_DATA(unicode);
6980 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006981 if (kind == PyUnicode_1BYTE_KIND) {
6982 return PyBytes_FromStringAndSize(data, len);
6983 }
Victor Stinner0e368262011-11-10 20:12:49 +01006984
Victor Stinner62ec3312016-09-06 17:04:34 -07006985 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6986 bytes, and 1 byte characters 4. */
6987 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006988
Victor Stinner62ec3312016-09-06 17:04:34 -07006989 if (len > PY_SSIZE_T_MAX / expandsize) {
6990 return PyErr_NoMemory();
6991 }
6992 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6993 if (repr == NULL) {
6994 return NULL;
6995 }
6996 if (len == 0) {
6997 return repr;
6998 }
6999
7000 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01007001 for (pos = 0; pos < len; pos++) {
7002 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02007003
Victor Stinner62ec3312016-09-06 17:04:34 -07007004 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
7005 if (ch < 0x100) {
7006 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00007007 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08007008 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07007009 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007010 *p++ = '\\';
7011 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02007012 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
7013 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
7014 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
7015 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00007016 }
Victor Stinner62ec3312016-09-06 17:04:34 -07007017 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
7018 else {
7019 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
7020 *p++ = '\\';
7021 *p++ = 'U';
7022 *p++ = '0';
7023 *p++ = '0';
7024 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
7025 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
7026 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
7027 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
7028 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
7029 *p++ = Py_hexdigits[ch & 15];
7030 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00007032
Victor Stinner62ec3312016-09-06 17:04:34 -07007033 assert(p > PyBytes_AS_STRING(repr));
7034 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
7035 return NULL;
7036 }
7037 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007038}
7039
Alexander Belopolsky40018472011-02-26 01:02:56 +00007040PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01007041PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
7042 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01007044 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007045 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01007046 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00007047 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01007048 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
7049 Py_DECREF(tmp);
7050 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007051}
7052
7053/* --- Latin-1 Codec ------------------------------------------------------ */
7054
Alexander Belopolsky40018472011-02-26 01:02:56 +00007055PyObject *
7056PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007057 Py_ssize_t size,
7058 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059{
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06007061 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062}
7063
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007064/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007065static void
7066make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007067 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007068 PyObject *unicode,
7069 Py_ssize_t startpos, Py_ssize_t endpos,
7070 const char *reason)
7071{
7072 if (*exceptionObject == NULL) {
7073 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007074 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01007075 encoding, unicode, startpos, endpos, reason);
7076 }
7077 else {
7078 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
7079 goto onError;
7080 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
7081 goto onError;
7082 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
7083 goto onError;
7084 return;
7085 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02007086 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01007087 }
7088}
7089
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007090/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007091static void
7092raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007093 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007094 PyObject *unicode,
7095 Py_ssize_t startpos, Py_ssize_t endpos,
7096 const char *reason)
7097{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007098 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01007099 encoding, unicode, startpos, endpos, reason);
7100 if (*exceptionObject != NULL)
7101 PyCodec_StrictErrors(*exceptionObject);
7102}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007103
7104/* error handling callback helper:
7105 build arguments, call the callback and check the arguments,
7106 put the result into newpos and return the replacement string, which
7107 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007108static PyObject *
7109unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007110 PyObject **errorHandler,
7111 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007112 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007113 Py_ssize_t startpos, Py_ssize_t endpos,
7114 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007115{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02007116 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007117 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007118 PyObject *restuple;
7119 PyObject *resunicode;
7120
7121 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007122 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007123 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007124 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007125 }
7126
Benjamin Petersonbac79492012-01-14 13:34:47 -05007127 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007128 return NULL;
7129 len = PyUnicode_GET_LENGTH(unicode);
7130
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007131 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007132 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007133 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007134 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007135
Petr Viktorinffd97532020-02-11 17:46:57 +01007136 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007137 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007139 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007140 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 Py_DECREF(restuple);
7142 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007143 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007144 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00007145 &resunicode, newpos)) {
7146 Py_DECREF(restuple);
7147 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007148 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007149 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
7150 PyErr_SetString(PyExc_TypeError, &argparse[3]);
7151 Py_DECREF(restuple);
7152 return NULL;
7153 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007154 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007155 *newpos = len + *newpos;
7156 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02007157 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00007158 Py_DECREF(restuple);
7159 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00007160 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007161 Py_INCREF(resunicode);
7162 Py_DECREF(restuple);
7163 return resunicode;
7164}
7165
Alexander Belopolsky40018472011-02-26 01:02:56 +00007166static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007167unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007168 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02007169 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007170{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007171 /* input state */
7172 Py_ssize_t pos=0, size;
7173 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007174 const void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007175 /* pointer into the output */
7176 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00007177 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
7178 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02007179 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007180 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02007181 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007182 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007183 /* output object */
7184 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007185
Benjamin Petersonbac79492012-01-14 13:34:47 -05007186 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007187 return NULL;
7188 size = PyUnicode_GET_LENGTH(unicode);
7189 kind = PyUnicode_KIND(unicode);
7190 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007191 /* allocate enough for a simple encoding without
7192 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00007193 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00007194 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007195
7196 _PyBytesWriter_Init(&writer);
7197 str = _PyBytesWriter_Alloc(&writer, size);
7198 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00007199 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007200
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007201 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02007202 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007203
Benjamin Peterson29060642009-01-31 22:14:21 +00007204 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02007205 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007206 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02007207 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007208 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00007209 }
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02007211 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00007212 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007213 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007214 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00007215 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02007216
Benjamin Petersona1c1be42014-09-29 18:18:57 -04007217 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00007218 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02007219
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007220 /* Only overallocate the buffer if it's not the last write */
7221 writer.overallocate = (collend < size);
7222
Benjamin Peterson29060642009-01-31 22:14:21 +00007223 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02007224 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007225 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02007226
7227 switch (error_handler) {
7228 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007229 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00007230 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02007231
7232 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02007233 memset(str, '?', collend - collstart);
7234 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02007235 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02007236 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007237 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007238 break;
Victor Stinner50149202015-09-22 00:26:54 +02007239
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007240 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007241 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007242 writer.min_size -= (collend - collstart);
7243 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007244 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007245 if (str == NULL)
7246 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007247 pos = collend;
7248 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007249
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007250 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007251 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02007252 writer.min_size -= (collend - collstart);
7253 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02007254 unicode, collstart, collend);
7255 if (str == NULL)
7256 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007257 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00007258 break;
Victor Stinner50149202015-09-22 00:26:54 +02007259
Victor Stinnerc3713e92015-09-29 12:32:13 +02007260 case _Py_ERROR_SURROGATEESCAPE:
7261 for (i = collstart; i < collend; ++i) {
7262 ch = PyUnicode_READ(kind, data, i);
7263 if (ch < 0xdc80 || 0xdcff < ch) {
7264 /* Not a UTF-8b surrogate */
7265 break;
7266 }
7267 *str++ = (char)(ch - 0xdc00);
7268 ++pos;
7269 }
7270 if (i >= collend)
7271 break;
7272 collstart = pos;
7273 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02007274 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02007275
Benjamin Peterson29060642009-01-31 22:14:21 +00007276 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007277 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
7278 encoding, reason, unicode, &exc,
7279 collstart, collend, &newpos);
7280 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00007281 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02007282
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07007283 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08007284 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02007285
Victor Stinner6bd525b2015-10-09 13:10:05 +02007286 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007287 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007288 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007289 PyBytes_AS_STRING(rep),
7290 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007291 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007292 else {
7293 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007294
Victor Stinner6bd525b2015-10-09 13:10:05 +02007295 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007296 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007297
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007298 if (limit == 256 ?
7299 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7300 !PyUnicode_IS_ASCII(rep))
7301 {
7302 /* Not all characters are smaller than limit */
7303 raise_encode_exception(&exc, encoding, unicode,
7304 collstart, collend, reason);
7305 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007306 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007307 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7308 str = _PyBytesWriter_WriteBytes(&writer, str,
7309 PyUnicode_DATA(rep),
7310 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007311 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007312 if (str == NULL)
7313 goto onError;
7314
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007315 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007316 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007317 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007318
7319 /* If overallocation was disabled, ensure that it was the last
7320 write. Otherwise, we missed an optimization */
7321 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007322 }
7323 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007324
Victor Stinner50149202015-09-22 00:26:54 +02007325 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007326 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007327 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007328
7329 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007330 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007331 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007332 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007333 Py_XDECREF(exc);
7334 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007335}
7336
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007337/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007338PyObject *
7339PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007340 Py_ssize_t size,
7341 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007343 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007344 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007345 if (unicode == NULL)
7346 return NULL;
7347 result = unicode_encode_ucs1(unicode, errors, 256);
7348 Py_DECREF(unicode);
7349 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007350}
7351
Alexander Belopolsky40018472011-02-26 01:02:56 +00007352PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007353_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354{
7355 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007356 PyErr_BadArgument();
7357 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007359 if (PyUnicode_READY(unicode) == -1)
7360 return NULL;
7361 /* Fast path: if it is a one-byte string, construct
7362 bytes object directly. */
7363 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7364 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7365 PyUnicode_GET_LENGTH(unicode));
7366 /* Non-Latin-1 characters present. Defer to above function to
7367 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007368 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007369}
7370
7371PyObject*
7372PyUnicode_AsLatin1String(PyObject *unicode)
7373{
7374 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375}
7376
7377/* --- 7-bit ASCII Codec -------------------------------------------------- */
7378
Alexander Belopolsky40018472011-02-26 01:02:56 +00007379PyObject *
7380PyUnicode_DecodeASCII(const char *s,
7381 Py_ssize_t size,
7382 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007383{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007384 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007385 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007386 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007387 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007388 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007389
Guido van Rossumd57fd912000-03-10 22:53:23 +00007390 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007391 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007392
Guido van Rossumd57fd912000-03-10 22:53:23 +00007393 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner2f9ada92020-06-24 02:22:21 +02007394 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner702c7342011-10-05 13:50:52 +02007395 return get_latin1_char((unsigned char)s[0]);
Victor Stinner2f9ada92020-06-24 02:22:21 +02007396 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007397
Inada Naoki770847a2019-06-24 12:30:24 +09007398 // Shortcut for simple case
7399 PyObject *u = PyUnicode_New(size, 127);
7400 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007401 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007402 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03007403 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_1BYTE_DATA(u));
Inada Naoki770847a2019-06-24 12:30:24 +09007404 if (outpos == size) {
7405 return u;
7406 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007407
Inada Naoki770847a2019-06-24 12:30:24 +09007408 _PyUnicodeWriter writer;
7409 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007410 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007411
Inada Naoki770847a2019-06-24 12:30:24 +09007412 s += outpos;
7413 int kind = writer.kind;
7414 void *data = writer.data;
7415 Py_ssize_t startinpos, endinpos;
7416
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007417 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007418 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007419 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007420 PyUnicode_WRITE(kind, data, writer.pos, c);
7421 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007422 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007423 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007424 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007425
7426 /* byte outsize range 0x00..0x7f: call the error handler */
7427
7428 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007429 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007430
7431 switch (error_handler)
7432 {
7433 case _Py_ERROR_REPLACE:
7434 case _Py_ERROR_SURROGATEESCAPE:
7435 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007436 but we may switch to UCS2 at the first write */
7437 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7438 goto onError;
7439 kind = writer.kind;
7440 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007441
7442 if (error_handler == _Py_ERROR_REPLACE)
7443 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7444 else
7445 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7446 writer.pos++;
7447 ++s;
7448 break;
7449
7450 case _Py_ERROR_IGNORE:
7451 ++s;
7452 break;
7453
7454 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007455 startinpos = s-starts;
7456 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007457 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007458 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007459 "ascii", "ordinal not in range(128)",
7460 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007461 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007462 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007463 kind = writer.kind;
7464 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007466 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007467 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007468 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007469 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007470
Benjamin Peterson29060642009-01-31 22:14:21 +00007471 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007472 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007473 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007474 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007475 return NULL;
7476}
7477
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007478/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007479PyObject *
7480PyUnicode_EncodeASCII(const Py_UNICODE *p,
7481 Py_ssize_t size,
7482 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007483{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007484 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007485 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007486 if (unicode == NULL)
7487 return NULL;
7488 result = unicode_encode_ucs1(unicode, errors, 128);
7489 Py_DECREF(unicode);
7490 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007491}
7492
Alexander Belopolsky40018472011-02-26 01:02:56 +00007493PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007494_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007495{
7496 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007497 PyErr_BadArgument();
7498 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007499 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007500 if (PyUnicode_READY(unicode) == -1)
7501 return NULL;
7502 /* Fast path: if it is an ASCII-only string, construct bytes object
7503 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007504 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007505 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7506 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007507 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007508}
7509
7510PyObject *
7511PyUnicode_AsASCIIString(PyObject *unicode)
7512{
7513 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007514}
7515
Steve Dowercc16be82016-09-08 10:35:16 -07007516#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007517
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007518/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007519
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007520#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007521#define NEED_RETRY
7522#endif
7523
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7528#define DECODING_CHUNK_SIZE (INT_MAX/4)
7529
Victor Stinner3a50e702011-10-18 21:21:00 +02007530#ifndef WC_ERR_INVALID_CHARS
7531# define WC_ERR_INVALID_CHARS 0x0080
7532#endif
7533
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007534static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007535code_page_name(UINT code_page, PyObject **obj)
7536{
7537 *obj = NULL;
7538 if (code_page == CP_ACP)
7539 return "mbcs";
7540 if (code_page == CP_UTF7)
7541 return "CP_UTF7";
7542 if (code_page == CP_UTF8)
7543 return "CP_UTF8";
7544
7545 *obj = PyBytes_FromFormat("cp%u", code_page);
7546 if (*obj == NULL)
7547 return NULL;
7548 return PyBytes_AS_STRING(*obj);
7549}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007550
Victor Stinner3a50e702011-10-18 21:21:00 +02007551static DWORD
7552decode_code_page_flags(UINT code_page)
7553{
7554 if (code_page == CP_UTF7) {
7555 /* The CP_UTF7 decoder only supports flags=0 */
7556 return 0;
7557 }
7558 else
7559 return MB_ERR_INVALID_CHARS;
7560}
7561
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007562/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007563 * Decode a byte string from a Windows code page into unicode object in strict
7564 * mode.
7565 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007566 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7567 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007568 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007569static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007570decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007571 wchar_t **buf,
7572 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007573 const char *in,
7574 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007575{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007576 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007577 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007579
7580 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007581 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007582 while ((outsize = MultiByteToWideChar(code_page, flags,
7583 in, insize, NULL, 0)) <= 0)
7584 {
7585 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7586 goto error;
7587 }
7588 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7589 flags = 0;
7590 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007591
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007592 /* Extend a wchar_t* buffer */
7593 Py_ssize_t n = *bufsize; /* Get the current length */
7594 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7595 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007596 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007597 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007598
7599 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7601 if (outsize <= 0)
7602 goto error;
7603 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007604
Victor Stinner3a50e702011-10-18 21:21:00 +02007605error:
7606 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7607 return -2;
7608 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007609 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007610}
7611
Victor Stinner3a50e702011-10-18 21:21:00 +02007612/*
7613 * Decode a byte string from a code page into unicode object with an error
7614 * handler.
7615 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007616 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007617 * UnicodeDecodeError exception and returns -1 on error.
7618 */
7619static int
7620decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007621 wchar_t **buf,
7622 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007623 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007624 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007625{
7626 const char *startin = in;
7627 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007628 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007629 /* Ideally, we should get reason from FormatMessage. This is the Windows
7630 2000 English version of the message. */
7631 const char *reason = "No mapping for the Unicode character exists "
7632 "in the target code page.";
7633 /* each step cannot decode more than 1 character, but a character can be
7634 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007635 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007636 int insize;
7637 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007638 PyObject *errorHandler = NULL;
7639 PyObject *exc = NULL;
7640 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007641 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007642 DWORD err;
7643 int ret = -1;
7644
7645 assert(size > 0);
7646
7647 encoding = code_page_name(code_page, &encoding_obj);
7648 if (encoding == NULL)
7649 return -1;
7650
Victor Stinner7d00cc12014-03-17 23:08:06 +01007651 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7653 UnicodeDecodeError. */
7654 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7655 if (exc != NULL) {
7656 PyCodec_StrictErrors(exc);
7657 Py_CLEAR(exc);
7658 }
7659 goto error;
7660 }
7661
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007662 /* Extend a wchar_t* buffer */
7663 Py_ssize_t n = *bufsize; /* Get the current length */
7664 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7665 PyErr_NoMemory();
7666 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007667 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007668 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7669 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007670 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007671 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007672
7673 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007674 while (in < endin)
7675 {
7676 /* Decode a character */
7677 insize = 1;
7678 do
7679 {
7680 outsize = MultiByteToWideChar(code_page, flags,
7681 in, insize,
7682 buffer, Py_ARRAY_LENGTH(buffer));
7683 if (outsize > 0)
7684 break;
7685 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007686 if (err == ERROR_INVALID_FLAGS && flags) {
7687 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7688 flags = 0;
7689 continue;
7690 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007691 if (err != ERROR_NO_UNICODE_TRANSLATION
7692 && err != ERROR_INSUFFICIENT_BUFFER)
7693 {
7694 PyErr_SetFromWindowsErr(0);
7695 goto error;
7696 }
7697 insize++;
7698 }
7699 /* 4=maximum length of a UTF-8 sequence */
7700 while (insize <= 4 && (in + insize) <= endin);
7701
7702 if (outsize <= 0) {
7703 Py_ssize_t startinpos, endinpos, outpos;
7704
Victor Stinner7d00cc12014-03-17 23:08:06 +01007705 /* last character in partial decode? */
7706 if (in + insize >= endin && !final)
7707 break;
7708
Victor Stinner3a50e702011-10-18 21:21:00 +02007709 startinpos = in - startin;
7710 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007711 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007712 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007713 errors, &errorHandler,
7714 encoding, reason,
7715 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007716 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007717 {
7718 goto error;
7719 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007720 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007721 }
7722 else {
7723 in += insize;
7724 memcpy(out, buffer, outsize * sizeof(wchar_t));
7725 out += outsize;
7726 }
7727 }
7728
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007729 /* Shrink the buffer */
7730 assert(out - *buf <= *bufsize);
7731 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007732 /* (in - startin) <= size and size is an int */
7733 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007734
7735error:
7736 Py_XDECREF(encoding_obj);
7737 Py_XDECREF(errorHandler);
7738 Py_XDECREF(exc);
7739 return ret;
7740}
7741
Victor Stinner3a50e702011-10-18 21:21:00 +02007742static PyObject *
7743decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007744 const char *s, Py_ssize_t size,
7745 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007746{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007747 wchar_t *buf = NULL;
7748 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007749 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007750
Victor Stinner3a50e702011-10-18 21:21:00 +02007751 if (code_page < 0) {
7752 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7753 return NULL;
7754 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007755 if (size < 0) {
7756 PyErr_BadInternalCall();
7757 return NULL;
7758 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007759
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007760 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007761 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007762
Victor Stinner76a31a62011-11-04 00:05:13 +01007763 do
7764 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007765#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007766 if (size > DECODING_CHUNK_SIZE) {
7767 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007768 final = 0;
7769 done = 0;
7770 }
7771 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007772#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007773 {
7774 chunk_size = (int)size;
7775 final = (consumed == NULL);
7776 done = 1;
7777 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007778
Victor Stinner76a31a62011-11-04 00:05:13 +01007779 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007780 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007781 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007782 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007783 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007784
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007785 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007786 s, chunk_size);
7787 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007788 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007789 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007790 errors, final);
7791 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007792
7793 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007794 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007795 return NULL;
7796 }
7797
7798 if (consumed)
7799 *consumed += converted;
7800
7801 s += converted;
7802 size -= converted;
7803 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007804
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007805 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7806 PyMem_Free(buf);
7807 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007808}
7809
Alexander Belopolsky40018472011-02-26 01:02:56 +00007810PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007811PyUnicode_DecodeCodePageStateful(int code_page,
7812 const char *s,
7813 Py_ssize_t size,
7814 const char *errors,
7815 Py_ssize_t *consumed)
7816{
7817 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7818}
7819
7820PyObject *
7821PyUnicode_DecodeMBCSStateful(const char *s,
7822 Py_ssize_t size,
7823 const char *errors,
7824 Py_ssize_t *consumed)
7825{
7826 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7827}
7828
7829PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007830PyUnicode_DecodeMBCS(const char *s,
7831 Py_ssize_t size,
7832 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007833{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007834 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7835}
7836
Victor Stinner3a50e702011-10-18 21:21:00 +02007837static DWORD
7838encode_code_page_flags(UINT code_page, const char *errors)
7839{
7840 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007841 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007842 }
7843 else if (code_page == CP_UTF7) {
7844 /* CP_UTF7 only supports flags=0 */
7845 return 0;
7846 }
7847 else {
7848 if (errors != NULL && strcmp(errors, "replace") == 0)
7849 return 0;
7850 else
7851 return WC_NO_BEST_FIT_CHARS;
7852 }
7853}
7854
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007855/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007856 * Encode a Unicode string to a Windows code page into a byte string in strict
7857 * mode.
7858 *
7859 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007860 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007861 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007862static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007863encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007864 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007865 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007866{
Victor Stinner554f3f02010-06-16 23:33:54 +00007867 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007868 BOOL *pusedDefaultChar = &usedDefaultChar;
7869 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007870 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007871 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007872 const DWORD flags = encode_code_page_flags(code_page, NULL);
7873 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007874 /* Create a substring so that we can get the UTF-16 representation
7875 of just the slice under consideration. */
7876 PyObject *substring;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007877 int ret = -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007878
Martin v. Löwis3d325192011-11-04 18:23:06 +01007879 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007880
Victor Stinner3a50e702011-10-18 21:21:00 +02007881 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007882 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007883 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007884 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007885
Victor Stinner2fc507f2011-11-04 20:06:39 +01007886 substring = PyUnicode_Substring(unicode, offset, offset+len);
7887 if (substring == NULL)
7888 return -1;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007889#if USE_UNICODE_WCHAR_CACHE
7890_Py_COMP_DIAG_PUSH
7891_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Victor Stinner2fc507f2011-11-04 20:06:39 +01007892 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7893 if (p == NULL) {
7894 Py_DECREF(substring);
7895 return -1;
7896 }
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007897_Py_COMP_DIAG_POP
7898#else /* USE_UNICODE_WCHAR_CACHE */
7899 p = PyUnicode_AsWideCharString(substring, &size);
7900 Py_CLEAR(substring);
7901 if (p == NULL) {
7902 return -1;
7903 }
7904#endif /* USE_UNICODE_WCHAR_CACHE */
Victor Stinner9f067f42013-06-05 00:21:31 +02007905 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007906
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007907 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007908 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007909 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007910 NULL, 0,
7911 NULL, pusedDefaultChar);
7912 if (outsize <= 0)
7913 goto error;
7914 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007915 if (pusedDefaultChar && *pusedDefaultChar) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007916 ret = -2;
7917 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007918 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007919
Victor Stinner3a50e702011-10-18 21:21:00 +02007920 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007921 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007922 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007923 if (*outbytes == NULL) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007924 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007925 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007926 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007927 }
7928 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007929 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007930 const Py_ssize_t n = PyBytes_Size(*outbytes);
7931 if (outsize > PY_SSIZE_T_MAX - n) {
7932 PyErr_NoMemory();
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007933 goto done;
Victor Stinner3a50e702011-10-18 21:21:00 +02007934 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007935 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007936 goto done;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007937 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007938 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007939 }
7940
7941 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007942 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007943 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007944 out, outsize,
7945 NULL, pusedDefaultChar);
7946 if (outsize <= 0)
7947 goto error;
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007948 if (pusedDefaultChar && *pusedDefaultChar) {
7949 ret = -2;
7950 goto done;
7951 }
7952 ret = 0;
7953
7954done:
7955#if USE_UNICODE_WCHAR_CACHE
7956 Py_DECREF(substring);
7957#else /* USE_UNICODE_WCHAR_CACHE */
7958 PyMem_Free(p);
7959#endif /* USE_UNICODE_WCHAR_CACHE */
7960 return ret;
Victor Stinner554f3f02010-06-16 23:33:54 +00007961
Victor Stinner3a50e702011-10-18 21:21:00 +02007962error:
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007963 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION) {
7964 ret = -2;
7965 goto done;
7966 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007967 PyErr_SetFromWindowsErr(0);
Serhiy Storchaka4c8f09d2020-07-10 23:26:06 +03007968 goto done;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007969}
7970
Victor Stinner3a50e702011-10-18 21:21:00 +02007971/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007972 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007973 * error handler.
7974 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007975 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007976 * -1 on other error.
7977 */
7978static int
7979encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007980 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007981 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007982{
Victor Stinner3a50e702011-10-18 21:21:00 +02007983 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007984 Py_ssize_t pos = unicode_offset;
7985 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007986 /* Ideally, we should get reason from FormatMessage. This is the Windows
7987 2000 English version of the message. */
7988 const char *reason = "invalid character";
7989 /* 4=maximum length of a UTF-8 sequence */
7990 char buffer[4];
7991 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7992 Py_ssize_t outsize;
7993 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007994 PyObject *errorHandler = NULL;
7995 PyObject *exc = NULL;
7996 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007997 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007998 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007999 PyObject *rep;
8000 int ret = -1;
8001
8002 assert(insize > 0);
8003
8004 encoding = code_page_name(code_page, &encoding_obj);
8005 if (encoding == NULL)
8006 return -1;
8007
8008 if (errors == NULL || strcmp(errors, "strict") == 0) {
8009 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
8010 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008011 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02008012 if (exc != NULL) {
8013 PyCodec_StrictErrors(exc);
8014 Py_DECREF(exc);
8015 }
8016 Py_XDECREF(encoding_obj);
8017 return -1;
8018 }
8019
8020 if (code_page != CP_UTF8 && code_page != CP_UTF7)
8021 pusedDefaultChar = &usedDefaultChar;
8022 else
8023 pusedDefaultChar = NULL;
8024
8025 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
8026 PyErr_NoMemory();
8027 goto error;
8028 }
8029 outsize = insize * Py_ARRAY_LENGTH(buffer);
8030
8031 if (*outbytes == NULL) {
8032 /* Create string object */
8033 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
8034 if (*outbytes == NULL)
8035 goto error;
8036 out = PyBytes_AS_STRING(*outbytes);
8037 }
8038 else {
8039 /* Extend string object */
8040 Py_ssize_t n = PyBytes_Size(*outbytes);
8041 if (n > PY_SSIZE_T_MAX - outsize) {
8042 PyErr_NoMemory();
8043 goto error;
8044 }
8045 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
8046 goto error;
8047 out = PyBytes_AS_STRING(*outbytes) + n;
8048 }
8049
8050 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01008051 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02008052 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01008053 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
8054 wchar_t chars[2];
8055 int charsize;
8056 if (ch < 0x10000) {
8057 chars[0] = (wchar_t)ch;
8058 charsize = 1;
8059 }
8060 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01008061 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
8062 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01008063 charsize = 2;
8064 }
8065
Victor Stinner3a50e702011-10-18 21:21:00 +02008066 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008067 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02008068 buffer, Py_ARRAY_LENGTH(buffer),
8069 NULL, pusedDefaultChar);
8070 if (outsize > 0) {
8071 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
8072 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008073 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02008074 memcpy(out, buffer, outsize);
8075 out += outsize;
8076 continue;
8077 }
8078 }
8079 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
8080 PyErr_SetFromWindowsErr(0);
8081 goto error;
8082 }
8083
Victor Stinner3a50e702011-10-18 21:21:00 +02008084 rep = unicode_encode_call_errorhandler(
8085 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01008086 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008087 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02008088 if (rep == NULL)
8089 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008090 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02008091
8092 if (PyBytes_Check(rep)) {
8093 outsize = PyBytes_GET_SIZE(rep);
8094 if (outsize != 1) {
8095 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8096 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8097 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8098 Py_DECREF(rep);
8099 goto error;
8100 }
8101 out = PyBytes_AS_STRING(*outbytes) + offset;
8102 }
8103 memcpy(out, PyBytes_AS_STRING(rep), outsize);
8104 out += outsize;
8105 }
8106 else {
8107 Py_ssize_t i;
8108 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008109 const void *data;
Victor Stinner3a50e702011-10-18 21:21:00 +02008110
Benjamin Petersonbac79492012-01-14 13:34:47 -05008111 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02008112 Py_DECREF(rep);
8113 goto error;
8114 }
8115
8116 outsize = PyUnicode_GET_LENGTH(rep);
8117 if (outsize != 1) {
8118 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
8119 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
8120 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
8121 Py_DECREF(rep);
8122 goto error;
8123 }
8124 out = PyBytes_AS_STRING(*outbytes) + offset;
8125 }
8126 kind = PyUnicode_KIND(rep);
8127 data = PyUnicode_DATA(rep);
8128 for (i=0; i < outsize; i++) {
8129 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8130 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008131 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008132 encoding, unicode,
8133 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02008134 "unable to encode error handler result to ASCII");
8135 Py_DECREF(rep);
8136 goto error;
8137 }
8138 *out = (unsigned char)ch;
8139 out++;
8140 }
8141 }
8142 Py_DECREF(rep);
8143 }
8144 /* write a NUL byte */
8145 *out = 0;
8146 outsize = out - PyBytes_AS_STRING(*outbytes);
8147 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
8148 if (_PyBytes_Resize(outbytes, outsize) < 0)
8149 goto error;
8150 ret = 0;
8151
8152error:
8153 Py_XDECREF(encoding_obj);
8154 Py_XDECREF(errorHandler);
8155 Py_XDECREF(exc);
8156 return ret;
8157}
8158
Victor Stinner3a50e702011-10-18 21:21:00 +02008159static PyObject *
8160encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01008161 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02008162 const char *errors)
8163{
Martin v. Löwis3d325192011-11-04 18:23:06 +01008164 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02008165 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01008166 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01008167 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01008168
Victor Stinner29dacf22015-01-26 16:41:32 +01008169 if (!PyUnicode_Check(unicode)) {
8170 PyErr_BadArgument();
8171 return NULL;
8172 }
8173
Benjamin Petersonbac79492012-01-14 13:34:47 -05008174 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01008175 return NULL;
8176 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00008177
Victor Stinner3a50e702011-10-18 21:21:00 +02008178 if (code_page < 0) {
8179 PyErr_SetString(PyExc_ValueError, "invalid code page number");
8180 return NULL;
8181 }
8182
Martin v. Löwis3d325192011-11-04 18:23:06 +01008183 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01008184 return PyBytes_FromStringAndSize(NULL, 0);
8185
Victor Stinner7581cef2011-11-03 22:32:33 +01008186 offset = 0;
8187 do
8188 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008189#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07008190 if (len > DECODING_CHUNK_SIZE) {
8191 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01008192 done = 0;
8193 }
Victor Stinner7581cef2011-11-03 22:32:33 +01008194 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008195#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01008196 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01008197 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008198 done = 1;
8199 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01008200
Victor Stinner76a31a62011-11-04 00:05:13 +01008201 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008202 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01008203 errors);
8204 if (ret == -2)
8205 ret = encode_code_page_errors(code_page, &outbytes,
8206 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01008207 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01008208 if (ret < 0) {
8209 Py_XDECREF(outbytes);
8210 return NULL;
8211 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008212
Victor Stinner7581cef2011-11-03 22:32:33 +01008213 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01008214 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01008215 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008216
Victor Stinner3a50e702011-10-18 21:21:00 +02008217 return outbytes;
8218}
8219
8220PyObject *
8221PyUnicode_EncodeMBCS(const Py_UNICODE *p,
8222 Py_ssize_t size,
8223 const char *errors)
8224{
Victor Stinner7581cef2011-11-03 22:32:33 +01008225 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008226 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01008227 if (unicode == NULL)
8228 return NULL;
8229 res = encode_code_page(CP_ACP, unicode, errors);
8230 Py_DECREF(unicode);
8231 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02008232}
8233
8234PyObject *
8235PyUnicode_EncodeCodePage(int code_page,
8236 PyObject *unicode,
8237 const char *errors)
8238{
Victor Stinner7581cef2011-11-03 22:32:33 +01008239 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008240}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00008241
Alexander Belopolsky40018472011-02-26 01:02:56 +00008242PyObject *
8243PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008244{
Victor Stinner7581cef2011-11-03 22:32:33 +01008245 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00008246}
8247
Thomas Wouters0e3f5912006-08-11 14:57:12 +00008248#undef NEED_RETRY
8249
Steve Dowercc16be82016-09-08 10:35:16 -07008250#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00008251
Guido van Rossumd57fd912000-03-10 22:53:23 +00008252/* --- Character Mapping Codec -------------------------------------------- */
8253
Victor Stinnerfb161b12013-04-18 01:44:27 +02008254static int
8255charmap_decode_string(const char *s,
8256 Py_ssize_t size,
8257 PyObject *mapping,
8258 const char *errors,
8259 _PyUnicodeWriter *writer)
8260{
8261 const char *starts = s;
8262 const char *e;
8263 Py_ssize_t startinpos, endinpos;
8264 PyObject *errorHandler = NULL, *exc = NULL;
8265 Py_ssize_t maplen;
8266 enum PyUnicode_Kind mapkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008267 const void *mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008268 Py_UCS4 x;
8269 unsigned char ch;
8270
8271 if (PyUnicode_READY(mapping) == -1)
8272 return -1;
8273
8274 maplen = PyUnicode_GET_LENGTH(mapping);
8275 mapdata = PyUnicode_DATA(mapping);
8276 mapkind = PyUnicode_KIND(mapping);
8277
8278 e = s + size;
8279
8280 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
8281 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
8282 * is disabled in encoding aliases, latin1 is preferred because
8283 * its implementation is faster. */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008284 const Py_UCS1 *mapdata_ucs1 = (const Py_UCS1 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008285 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8286 Py_UCS4 maxchar = writer->maxchar;
8287
8288 assert (writer->kind == PyUnicode_1BYTE_KIND);
8289 while (s < e) {
8290 ch = *s;
8291 x = mapdata_ucs1[ch];
8292 if (x > maxchar) {
8293 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
8294 goto onError;
8295 maxchar = writer->maxchar;
8296 outdata = (Py_UCS1 *)writer->data;
8297 }
8298 outdata[writer->pos] = x;
8299 writer->pos++;
8300 ++s;
8301 }
8302 return 0;
8303 }
8304
8305 while (s < e) {
8306 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8307 enum PyUnicode_Kind outkind = writer->kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008308 const Py_UCS2 *mapdata_ucs2 = (const Py_UCS2 *)mapdata;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008309 if (outkind == PyUnicode_1BYTE_KIND) {
8310 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8311 Py_UCS4 maxchar = writer->maxchar;
8312 while (s < e) {
8313 ch = *s;
8314 x = mapdata_ucs2[ch];
8315 if (x > maxchar)
8316 goto Error;
8317 outdata[writer->pos] = x;
8318 writer->pos++;
8319 ++s;
8320 }
8321 break;
8322 }
8323 else if (outkind == PyUnicode_2BYTE_KIND) {
8324 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8325 while (s < e) {
8326 ch = *s;
8327 x = mapdata_ucs2[ch];
8328 if (x == 0xFFFE)
8329 goto Error;
8330 outdata[writer->pos] = x;
8331 writer->pos++;
8332 ++s;
8333 }
8334 break;
8335 }
8336 }
8337 ch = *s;
8338
8339 if (ch < maplen)
8340 x = PyUnicode_READ(mapkind, mapdata, ch);
8341 else
8342 x = 0xfffe; /* invalid value */
8343Error:
8344 if (x == 0xfffe)
8345 {
8346 /* undefined mapping */
8347 startinpos = s-starts;
8348 endinpos = startinpos+1;
8349 if (unicode_decode_call_errorhandler_writer(
8350 errors, &errorHandler,
8351 "charmap", "character maps to <undefined>",
8352 &starts, &e, &startinpos, &endinpos, &exc, &s,
8353 writer)) {
8354 goto onError;
8355 }
8356 continue;
8357 }
8358
8359 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8360 goto onError;
8361 ++s;
8362 }
8363 Py_XDECREF(errorHandler);
8364 Py_XDECREF(exc);
8365 return 0;
8366
8367onError:
8368 Py_XDECREF(errorHandler);
8369 Py_XDECREF(exc);
8370 return -1;
8371}
8372
8373static int
8374charmap_decode_mapping(const char *s,
8375 Py_ssize_t size,
8376 PyObject *mapping,
8377 const char *errors,
8378 _PyUnicodeWriter *writer)
8379{
8380 const char *starts = s;
8381 const char *e;
8382 Py_ssize_t startinpos, endinpos;
8383 PyObject *errorHandler = NULL, *exc = NULL;
8384 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008385 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008386
8387 e = s + size;
8388
8389 while (s < e) {
8390 ch = *s;
8391
8392 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8393 key = PyLong_FromLong((long)ch);
8394 if (key == NULL)
8395 goto onError;
8396
8397 item = PyObject_GetItem(mapping, key);
8398 Py_DECREF(key);
8399 if (item == NULL) {
8400 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8401 /* No mapping found means: mapping is undefined. */
8402 PyErr_Clear();
8403 goto Undefined;
8404 } else
8405 goto onError;
8406 }
8407
8408 /* Apply mapping */
8409 if (item == Py_None)
8410 goto Undefined;
8411 if (PyLong_Check(item)) {
8412 long value = PyLong_AS_LONG(item);
8413 if (value == 0xFFFE)
8414 goto Undefined;
8415 if (value < 0 || value > MAX_UNICODE) {
8416 PyErr_Format(PyExc_TypeError,
Max Bernstein36353882020-10-17 13:38:21 -07008417 "character mapping must be in range(0x%x)",
Victor Stinnerfb161b12013-04-18 01:44:27 +02008418 (unsigned long)MAX_UNICODE + 1);
8419 goto onError;
8420 }
8421
8422 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8423 goto onError;
8424 }
8425 else if (PyUnicode_Check(item)) {
8426 if (PyUnicode_READY(item) == -1)
8427 goto onError;
8428 if (PyUnicode_GET_LENGTH(item) == 1) {
8429 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8430 if (value == 0xFFFE)
8431 goto Undefined;
8432 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8433 goto onError;
8434 }
8435 else {
8436 writer->overallocate = 1;
8437 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8438 goto onError;
8439 }
8440 }
8441 else {
8442 /* wrong return value */
8443 PyErr_SetString(PyExc_TypeError,
8444 "character mapping must return integer, None or str");
8445 goto onError;
8446 }
8447 Py_CLEAR(item);
8448 ++s;
8449 continue;
8450
8451Undefined:
8452 /* undefined mapping */
8453 Py_CLEAR(item);
8454 startinpos = s-starts;
8455 endinpos = startinpos+1;
8456 if (unicode_decode_call_errorhandler_writer(
8457 errors, &errorHandler,
8458 "charmap", "character maps to <undefined>",
8459 &starts, &e, &startinpos, &endinpos, &exc, &s,
8460 writer)) {
8461 goto onError;
8462 }
8463 }
8464 Py_XDECREF(errorHandler);
8465 Py_XDECREF(exc);
8466 return 0;
8467
8468onError:
8469 Py_XDECREF(item);
8470 Py_XDECREF(errorHandler);
8471 Py_XDECREF(exc);
8472 return -1;
8473}
8474
Alexander Belopolsky40018472011-02-26 01:02:56 +00008475PyObject *
8476PyUnicode_DecodeCharmap(const char *s,
8477 Py_ssize_t size,
8478 PyObject *mapping,
8479 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008480{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008481 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008482
Guido van Rossumd57fd912000-03-10 22:53:23 +00008483 /* Default to Latin-1 */
8484 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008485 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008486
Guido van Rossumd57fd912000-03-10 22:53:23 +00008487 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008488 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008489 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008490 writer.min_length = size;
8491 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008492 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008493
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008494 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008495 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8496 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008497 }
8498 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008499 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8500 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008501 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008502 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008503
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008505 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008506 return NULL;
8507}
8508
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008509/* Charmap encoding: the lookup table */
8510
Alexander Belopolsky40018472011-02-26 01:02:56 +00008511struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 PyObject_HEAD
8513 unsigned char level1[32];
8514 int count2, count3;
8515 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008516};
8517
8518static PyObject*
8519encoding_map_size(PyObject *obj, PyObject* args)
8520{
8521 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008522 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008523 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008524}
8525
8526static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008527 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008528 PyDoc_STR("Return the size (in bytes) of this object") },
8529 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008530};
8531
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008532static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008533 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008534 "EncodingMap", /*tp_name*/
8535 sizeof(struct encoding_map), /*tp_basicsize*/
8536 0, /*tp_itemsize*/
8537 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008538 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008539 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008540 0, /*tp_getattr*/
8541 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008542 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 0, /*tp_repr*/
8544 0, /*tp_as_number*/
8545 0, /*tp_as_sequence*/
8546 0, /*tp_as_mapping*/
8547 0, /*tp_hash*/
8548 0, /*tp_call*/
8549 0, /*tp_str*/
8550 0, /*tp_getattro*/
8551 0, /*tp_setattro*/
8552 0, /*tp_as_buffer*/
8553 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8554 0, /*tp_doc*/
8555 0, /*tp_traverse*/
8556 0, /*tp_clear*/
8557 0, /*tp_richcompare*/
8558 0, /*tp_weaklistoffset*/
8559 0, /*tp_iter*/
8560 0, /*tp_iternext*/
8561 encoding_map_methods, /*tp_methods*/
8562 0, /*tp_members*/
8563 0, /*tp_getset*/
8564 0, /*tp_base*/
8565 0, /*tp_dict*/
8566 0, /*tp_descr_get*/
8567 0, /*tp_descr_set*/
8568 0, /*tp_dictoffset*/
8569 0, /*tp_init*/
8570 0, /*tp_alloc*/
8571 0, /*tp_new*/
8572 0, /*tp_free*/
8573 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008574};
8575
8576PyObject*
8577PyUnicode_BuildEncodingMap(PyObject* string)
8578{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008579 PyObject *result;
8580 struct encoding_map *mresult;
8581 int i;
8582 int need_dict = 0;
8583 unsigned char level1[32];
8584 unsigned char level2[512];
8585 unsigned char *mlevel1, *mlevel2, *mlevel3;
8586 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008587 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008588 const void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008589 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008590 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008591
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008592 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008593 PyErr_BadArgument();
8594 return NULL;
8595 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008596 kind = PyUnicode_KIND(string);
8597 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008598 length = PyUnicode_GET_LENGTH(string);
8599 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008600 memset(level1, 0xFF, sizeof level1);
8601 memset(level2, 0xFF, sizeof level2);
8602
8603 /* If there isn't a one-to-one mapping of NULL to \0,
8604 or if there are non-BMP characters, we need to use
8605 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008606 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008607 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008608 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008609 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008610 ch = PyUnicode_READ(kind, data, i);
8611 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008612 need_dict = 1;
8613 break;
8614 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008615 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008616 /* unmapped character */
8617 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008618 l1 = ch >> 11;
8619 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008620 if (level1[l1] == 0xFF)
8621 level1[l1] = count2++;
8622 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008623 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008624 }
8625
8626 if (count2 >= 0xFF || count3 >= 0xFF)
8627 need_dict = 1;
8628
8629 if (need_dict) {
8630 PyObject *result = PyDict_New();
8631 PyObject *key, *value;
8632 if (!result)
8633 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008634 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008635 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008636 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008637 if (!key || !value)
8638 goto failed1;
8639 if (PyDict_SetItem(result, key, value) == -1)
8640 goto failed1;
8641 Py_DECREF(key);
8642 Py_DECREF(value);
8643 }
8644 return result;
8645 failed1:
8646 Py_XDECREF(key);
8647 Py_XDECREF(value);
8648 Py_DECREF(result);
8649 return NULL;
8650 }
8651
8652 /* Create a three-level trie */
Victor Stinner32bd68c2020-12-01 10:37:39 +01008653 result = PyObject_Malloc(sizeof(struct encoding_map) +
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008654 16*count2 + 128*count3 - 1);
Victor Stinner04fc4f22020-06-16 01:28:07 +02008655 if (!result) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008656 return PyErr_NoMemory();
Victor Stinner04fc4f22020-06-16 01:28:07 +02008657 }
8658
8659 _PyObject_Init(result, &EncodingMapType);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008660 mresult = (struct encoding_map*)result;
8661 mresult->count2 = count2;
8662 mresult->count3 = count3;
8663 mlevel1 = mresult->level1;
8664 mlevel2 = mresult->level23;
8665 mlevel3 = mresult->level23 + 16*count2;
8666 memcpy(mlevel1, level1, 32);
8667 memset(mlevel2, 0xFF, 16*count2);
8668 memset(mlevel3, 0, 128*count3);
8669 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008670 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008671 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008672 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8673 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008674 /* unmapped character */
8675 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008676 o1 = ch>>11;
8677 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008678 i2 = 16*mlevel1[o1] + o2;
8679 if (mlevel2[i2] == 0xFF)
8680 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008681 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008682 i3 = 128*mlevel2[i2] + o3;
8683 mlevel3[i3] = i;
8684 }
8685 return result;
8686}
8687
8688static int
Victor Stinner22168992011-11-20 17:09:18 +01008689encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008690{
8691 struct encoding_map *map = (struct encoding_map*)mapping;
8692 int l1 = c>>11;
8693 int l2 = (c>>7) & 0xF;
8694 int l3 = c & 0x7F;
8695 int i;
8696
Victor Stinner22168992011-11-20 17:09:18 +01008697 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008699 if (c == 0)
8700 return 0;
8701 /* level 1*/
8702 i = map->level1[l1];
8703 if (i == 0xFF) {
8704 return -1;
8705 }
8706 /* level 2*/
8707 i = map->level23[16*i+l2];
8708 if (i == 0xFF) {
8709 return -1;
8710 }
8711 /* level 3 */
8712 i = map->level23[16*map->count2 + 128*i + l3];
8713 if (i == 0) {
8714 return -1;
8715 }
8716 return i;
8717}
8718
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719/* Lookup the character ch in the mapping. If the character
8720 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008721 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008722static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008723charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008724{
Christian Heimes217cfd12007-12-02 14:31:20 +00008725 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008726 PyObject *x;
8727
8728 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008729 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730 x = PyObject_GetItem(mapping, w);
8731 Py_DECREF(w);
8732 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008733 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8734 /* No mapping found means: mapping is undefined. */
8735 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008736 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008737 } else
8738 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008739 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008740 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008741 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008742 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008743 long value = PyLong_AS_LONG(x);
8744 if (value < 0 || value > 255) {
8745 PyErr_SetString(PyExc_TypeError,
8746 "character mapping must be in range(256)");
8747 Py_DECREF(x);
8748 return NULL;
8749 }
8750 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008751 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008752 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008753 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008754 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008755 /* wrong return value */
8756 PyErr_Format(PyExc_TypeError,
8757 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008758 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 Py_DECREF(x);
8760 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008761 }
8762}
8763
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008764static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008765charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008766{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008767 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8768 /* exponentially overallocate to minimize reallocations */
8769 if (requiredsize < 2*outsize)
8770 requiredsize = 2*outsize;
8771 if (_PyBytes_Resize(outobj, requiredsize))
8772 return -1;
8773 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008774}
8775
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an exception was raised (lookup or resize failed) */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008779/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008780 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008781 space is available. Return a new reference to the object that
8782 was put in the output buffer, or Py_None, if the mapping was undefined
8783 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008784 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008785static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008786charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008787 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008788{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008789 PyObject *rep;
8790 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008791 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008792
Andy Lesterdffe4c02020-03-04 07:15:20 -06008793 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008794 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008795 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008796 if (res == -1)
8797 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008798 if (outsize<requiredsize)
8799 if (charmapencode_resize(outobj, outpos, requiredsize))
8800 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008801 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008802 outstart[(*outpos)++] = (char)res;
8803 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008804 }
8805
8806 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008807 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008808 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008809 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008810 Py_DECREF(rep);
8811 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008812 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008813 if (PyLong_Check(rep)) {
8814 Py_ssize_t requiredsize = *outpos+1;
8815 if (outsize<requiredsize)
8816 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8817 Py_DECREF(rep);
8818 return enc_EXCEPTION;
8819 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008820 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008821 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008822 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008823 else {
8824 const char *repchars = PyBytes_AS_STRING(rep);
8825 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8826 Py_ssize_t requiredsize = *outpos+repsize;
8827 if (outsize<requiredsize)
8828 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8829 Py_DECREF(rep);
8830 return enc_EXCEPTION;
8831 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008832 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008833 memcpy(outstart + *outpos, repchars, repsize);
8834 *outpos += repsize;
8835 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008836 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008837 Py_DECREF(rep);
8838 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008839}
8840
8841/* handle an error in PyUnicode_EncodeCharmap
8842 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008843static int
8844charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008845 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008846 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008847 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008848 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008849{
8850 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008851 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008852 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008853 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03008854 const void *data;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008855 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008856 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008857 Py_ssize_t collstartpos = *inpos;
8858 Py_ssize_t collendpos = *inpos+1;
8859 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008860 const char *encoding = "charmap";
8861 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008862 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008863 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008864 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008865
Benjamin Petersonbac79492012-01-14 13:34:47 -05008866 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008867 return -1;
8868 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008869 /* find all unencodable characters */
8870 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008871 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008872 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008873 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008874 val = encoding_map_lookup(ch, mapping);
8875 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008876 break;
8877 ++collendpos;
8878 continue;
8879 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008880
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008881 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8882 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 if (rep==NULL)
8884 return -1;
8885 else if (rep!=Py_None) {
8886 Py_DECREF(rep);
8887 break;
8888 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008889 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008891 }
8892 /* cache callback name lookup
8893 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008894 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008895 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008896
8897 switch (*error_handler) {
8898 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008899 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008900 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008901
8902 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008903 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008904 x = charmapencode_output('?', mapping, res, respos);
8905 if (x==enc_EXCEPTION) {
8906 return -1;
8907 }
8908 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008909 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008910 return -1;
8911 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008912 }
8913 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008914 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008915 *inpos = collendpos;
8916 break;
Victor Stinner50149202015-09-22 00:26:54 +02008917
8918 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008919 /* generate replacement (temporarily (mis)uses p) */
8920 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008921 char buffer[2+29+1+1];
8922 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008923 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008924 for (cp = buffer; *cp; ++cp) {
8925 x = charmapencode_output(*cp, mapping, res, respos);
8926 if (x==enc_EXCEPTION)
8927 return -1;
8928 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008929 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008930 return -1;
8931 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008932 }
8933 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008934 *inpos = collendpos;
8935 break;
Victor Stinner50149202015-09-22 00:26:54 +02008936
Benjamin Peterson14339b62009-01-31 16:36:08 +00008937 default:
Victor Stinner50149202015-09-22 00:26:54 +02008938 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008939 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008940 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008941 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008942 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008943 if (PyBytes_Check(repunicode)) {
8944 /* Directly copy bytes result to output. */
8945 Py_ssize_t outsize = PyBytes_Size(*res);
8946 Py_ssize_t requiredsize;
8947 repsize = PyBytes_Size(repunicode);
8948 requiredsize = *respos + repsize;
8949 if (requiredsize > outsize)
8950 /* Make room for all additional bytes. */
8951 if (charmapencode_resize(res, respos, requiredsize)) {
8952 Py_DECREF(repunicode);
8953 return -1;
8954 }
8955 memcpy(PyBytes_AsString(*res) + *respos,
8956 PyBytes_AsString(repunicode), repsize);
8957 *respos += repsize;
8958 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008959 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008960 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008961 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008962 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008963 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008964 Py_DECREF(repunicode);
8965 return -1;
8966 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008967 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008968 data = PyUnicode_DATA(repunicode);
8969 kind = PyUnicode_KIND(repunicode);
8970 for (index = 0; index < repsize; index++) {
8971 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8972 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008974 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 return -1;
8976 }
8977 else if (x==enc_FAILED) {
8978 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008979 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008980 return -1;
8981 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008982 }
8983 *inpos = newpos;
8984 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008985 }
8986 return 0;
8987}
8988
Alexander Belopolsky40018472011-02-26 01:02:56 +00008989PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008990_PyUnicode_EncodeCharmap(PyObject *unicode,
8991 PyObject *mapping,
8992 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008994 /* output object */
8995 PyObject *res = NULL;
8996 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008997 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008998 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008999 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00009000 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02009001 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009002 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02009003 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009004 const void *data;
Victor Stinner69ed0f42013-04-09 21:48:24 +02009005 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009006
Benjamin Petersonbac79492012-01-14 13:34:47 -05009007 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009008 return NULL;
9009 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02009010 data = PyUnicode_DATA(unicode);
9011 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009012
Guido van Rossumd57fd912000-03-10 22:53:23 +00009013 /* Default to Latin-1 */
9014 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009015 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009017 /* allocate enough for a simple encoding without
9018 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00009019 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009020 if (res == NULL)
9021 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00009022 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009023 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009024
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009025 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02009026 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009027 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009028 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009029 if (x==enc_EXCEPTION) /* error */
9030 goto onError;
9031 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009032 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00009033 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02009034 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00009035 &res, &respos)) {
9036 goto onError;
9037 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00009038 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009039 else
9040 /* done with this character => adjust input position */
9041 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009043
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009044 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00009045 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00009046 if (_PyBytes_Resize(&res, respos) < 0)
9047 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00009048
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009049 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02009050 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009051 return res;
9052
Benjamin Peterson29060642009-01-31 22:14:21 +00009053 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009054 Py_XDECREF(res);
9055 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02009056 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009057 return NULL;
9058}
9059
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009060/* Deprecated */
9061PyObject *
9062PyUnicode_EncodeCharmap(const Py_UNICODE *p,
9063 Py_ssize_t size,
9064 PyObject *mapping,
9065 const char *errors)
9066{
9067 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009068 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009069 if (unicode == NULL)
9070 return NULL;
9071 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
9072 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01009073 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009074}
9075
Alexander Belopolsky40018472011-02-26 01:02:56 +00009076PyObject *
9077PyUnicode_AsCharmapString(PyObject *unicode,
9078 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079{
9080 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009081 PyErr_BadArgument();
9082 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009083 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009084 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009085}
9086
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009087/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009088static void
9089make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009091 Py_ssize_t startpos, Py_ssize_t endpos,
9092 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009093{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009094 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009095 *exceptionObject = _PyUnicodeTranslateError_Create(
9096 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009097 }
9098 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00009099 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
9100 goto onError;
9101 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
9102 goto onError;
9103 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
9104 goto onError;
9105 return;
9106 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02009107 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108 }
9109}
9110
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009111/* error handling callback helper:
9112 build arguments, call the callback and check the arguments,
9113 put the result into newpos and return the replacement string, which
9114 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009115static PyObject *
9116unicode_translate_call_errorhandler(const char *errors,
9117 PyObject **errorHandler,
9118 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009119 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009120 Py_ssize_t startpos, Py_ssize_t endpos,
9121 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009122{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009123 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009124
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00009125 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009126 PyObject *restuple;
9127 PyObject *resunicode;
9128
9129 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009130 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009131 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009132 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009133 }
9134
9135 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009136 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009137 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009138 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009139
Petr Viktorinffd97532020-02-11 17:46:57 +01009140 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009141 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009142 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009143 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009144 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00009145 Py_DECREF(restuple);
9146 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009147 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03009148 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00009149 &resunicode, &i_newpos)) {
9150 Py_DECREF(restuple);
9151 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009152 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00009153 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00009155 else
9156 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009157 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02009158 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 Py_DECREF(restuple);
9160 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00009161 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009162 Py_INCREF(resunicode);
9163 Py_DECREF(restuple);
9164 return resunicode;
9165}
9166
9167/* Lookup the character ch in the mapping and put the result in result,
9168 which must be decrefed by the caller.
9169 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009170static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009171charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009172{
Christian Heimes217cfd12007-12-02 14:31:20 +00009173 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009174 PyObject *x;
9175
9176 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00009177 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009178 x = PyObject_GetItem(mapping, w);
9179 Py_DECREF(w);
9180 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009181 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
9182 /* No mapping found means: use 1:1 mapping. */
9183 PyErr_Clear();
9184 *result = NULL;
9185 return 0;
9186 } else
9187 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009188 }
9189 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009190 *result = x;
9191 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009192 }
Christian Heimes217cfd12007-12-02 14:31:20 +00009193 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009194 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009195 if (value < 0 || value > MAX_UNICODE) {
9196 PyErr_Format(PyExc_ValueError,
9197 "character mapping must be in range(0x%x)",
9198 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00009199 Py_DECREF(x);
9200 return -1;
9201 }
9202 *result = x;
9203 return 0;
9204 }
9205 else if (PyUnicode_Check(x)) {
9206 *result = x;
9207 return 0;
9208 }
9209 else {
9210 /* wrong return value */
9211 PyErr_SetString(PyExc_TypeError,
9212 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00009213 Py_DECREF(x);
9214 return -1;
9215 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009216}
Victor Stinner1194ea02014-04-04 19:37:40 +02009217
9218/* lookup the character, write the result into the writer.
9219 Return 1 if the result was written into the writer, return 0 if the mapping
9220 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00009221static int
Victor Stinner1194ea02014-04-04 19:37:40 +02009222charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
9223 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009224{
Victor Stinner1194ea02014-04-04 19:37:40 +02009225 PyObject *item;
9226
9227 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00009228 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009229
9230 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009231 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02009232 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009233 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00009234 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009235 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009236 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009237
9238 if (item == Py_None) {
9239 Py_DECREF(item);
9240 return 0;
9241 }
9242
9243 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02009244 long ch = (Py_UCS4)PyLong_AS_LONG(item);
9245 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9246 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009247 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
9248 Py_DECREF(item);
9249 return -1;
9250 }
9251 Py_DECREF(item);
9252 return 1;
9253 }
9254
9255 if (!PyUnicode_Check(item)) {
9256 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00009257 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02009258 }
9259
9260 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
9261 Py_DECREF(item);
9262 return -1;
9263 }
9264
9265 Py_DECREF(item);
9266 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009267}
9268
Victor Stinner89a76ab2014-04-05 11:44:04 +02009269static int
9270unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
9271 Py_UCS1 *translate)
9272{
Benjamin Peterson1365de72014-04-07 20:15:41 -04009273 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009274 int ret = 0;
9275
Victor Stinner89a76ab2014-04-05 11:44:04 +02009276 if (charmaptranslate_lookup(ch, mapping, &item)) {
9277 return -1;
9278 }
9279
9280 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009281 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02009282 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009283 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009284 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009285 /* not found => default to 1:1 mapping */
9286 translate[ch] = ch;
9287 return 1;
9288 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04009289 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02009290 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02009291 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
9292 used it */
9293 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009294 /* invalid character or character outside ASCII:
9295 skip the fast translate */
9296 goto exit;
9297 }
9298 translate[ch] = (Py_UCS1)replace;
9299 }
9300 else if (PyUnicode_Check(item)) {
9301 Py_UCS4 replace;
9302
9303 if (PyUnicode_READY(item) == -1) {
9304 Py_DECREF(item);
9305 return -1;
9306 }
9307 if (PyUnicode_GET_LENGTH(item) != 1)
9308 goto exit;
9309
9310 replace = PyUnicode_READ_CHAR(item, 0);
9311 if (replace > 127)
9312 goto exit;
9313 translate[ch] = (Py_UCS1)replace;
9314 }
9315 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009316 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009317 goto exit;
9318 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009319 ret = 1;
9320
Benjamin Peterson1365de72014-04-07 20:15:41 -04009321 exit:
9322 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009323 return ret;
9324}
9325
9326/* Fast path for ascii => ascii translation. Return 1 if the whole string
9327 was translated into writer, return 0 if the input string was partially
9328 translated into writer, raise an exception and return -1 on error. */
9329static int
9330unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009331 _PyUnicodeWriter *writer, int ignore,
9332 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009333{
Victor Stinner872b2912014-04-05 14:27:07 +02009334 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009335 Py_ssize_t len;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009336 const Py_UCS1 *in, *end;
9337 Py_UCS1 *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009338 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009339
Victor Stinner89a76ab2014-04-05 11:44:04 +02009340 len = PyUnicode_GET_LENGTH(input);
9341
Victor Stinner872b2912014-04-05 14:27:07 +02009342 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009343
9344 in = PyUnicode_1BYTE_DATA(input);
9345 end = in + len;
9346
9347 assert(PyUnicode_IS_ASCII(writer->buffer));
9348 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9349 out = PyUnicode_1BYTE_DATA(writer->buffer);
9350
Victor Stinner872b2912014-04-05 14:27:07 +02009351 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009352 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009353 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009354 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009355 int translate = unicode_fast_translate_lookup(mapping, ch,
9356 ascii_table);
9357 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009358 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009359 if (translate == 0)
9360 goto exit;
9361 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009362 }
Victor Stinner872b2912014-04-05 14:27:07 +02009363 if (ch2 == 0xfe) {
9364 if (ignore)
9365 continue;
9366 goto exit;
9367 }
9368 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009369 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009370 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009371 }
Victor Stinner872b2912014-04-05 14:27:07 +02009372 res = 1;
9373
9374exit:
9375 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009376 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009377 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009378}
9379
Victor Stinner3222da22015-10-01 22:07:32 +02009380static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009381_PyUnicode_TranslateCharmap(PyObject *input,
9382 PyObject *mapping,
9383 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009384{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009385 /* input object */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009386 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387 Py_ssize_t size, i;
9388 int kind;
9389 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009390 _PyUnicodeWriter writer;
9391 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009392 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009393 PyObject *errorHandler = NULL;
9394 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009395 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009396 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009397
Guido van Rossumd57fd912000-03-10 22:53:23 +00009398 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009399 PyErr_BadArgument();
9400 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009401 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009402
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009403 if (PyUnicode_READY(input) == -1)
9404 return NULL;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009405 data = PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009406 kind = PyUnicode_KIND(input);
9407 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009408
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009409 if (size == 0)
9410 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009411
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009412 /* allocate enough for a simple 1:1 translation without
9413 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009414 _PyUnicodeWriter_Init(&writer);
9415 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009416 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009417
Victor Stinner872b2912014-04-05 14:27:07 +02009418 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9419
Victor Stinner33798672016-03-01 21:59:58 +01009420 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009421 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009422 if (PyUnicode_IS_ASCII(input)) {
9423 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9424 if (res < 0) {
9425 _PyUnicodeWriter_Dealloc(&writer);
9426 return NULL;
9427 }
9428 if (res == 1)
9429 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009430 }
Victor Stinner33798672016-03-01 21:59:58 +01009431 else {
9432 i = 0;
9433 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009434
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009435 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009436 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009437 int translate;
9438 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9439 Py_ssize_t newpos;
9440 /* startpos for collecting untranslatable chars */
9441 Py_ssize_t collstart;
9442 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009443 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009444
Victor Stinner1194ea02014-04-04 19:37:40 +02009445 ch = PyUnicode_READ(kind, data, i);
9446 translate = charmaptranslate_output(ch, mapping, &writer);
9447 if (translate < 0)
9448 goto onError;
9449
9450 if (translate != 0) {
9451 /* it worked => adjust input pointer */
9452 ++i;
9453 continue;
9454 }
9455
9456 /* untranslatable character */
9457 collstart = i;
9458 collend = i+1;
9459
9460 /* find all untranslatable characters */
9461 while (collend < size) {
9462 PyObject *x;
9463 ch = PyUnicode_READ(kind, data, collend);
9464 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009465 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009466 Py_XDECREF(x);
9467 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009468 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009469 ++collend;
9470 }
9471
9472 if (ignore) {
9473 i = collend;
9474 }
9475 else {
9476 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9477 reason, input, &exc,
9478 collstart, collend, &newpos);
9479 if (repunicode == NULL)
9480 goto onError;
9481 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009482 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009483 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009484 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009485 Py_DECREF(repunicode);
9486 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009487 }
9488 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009489 Py_XDECREF(exc);
9490 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009491 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009492
Benjamin Peterson29060642009-01-31 22:14:21 +00009493 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009494 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009495 Py_XDECREF(exc);
9496 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497 return NULL;
9498}
9499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500/* Deprecated. Use PyUnicode_Translate instead. */
9501PyObject *
9502PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9503 Py_ssize_t size,
9504 PyObject *mapping,
9505 const char *errors)
9506{
Christian Heimes5f520f42012-09-11 14:03:25 +02009507 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009508 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009509 if (!unicode)
9510 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009511 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9512 Py_DECREF(unicode);
9513 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514}
9515
Alexander Belopolsky40018472011-02-26 01:02:56 +00009516PyObject *
9517PyUnicode_Translate(PyObject *str,
9518 PyObject *mapping,
9519 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009520{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009521 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009522 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009523 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009524}
Tim Petersced69f82003-09-16 20:30:58 +00009525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526PyObject *
9527_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9528{
9529 if (!PyUnicode_Check(unicode)) {
9530 PyErr_BadInternalCall();
9531 return NULL;
9532 }
9533 if (PyUnicode_READY(unicode) == -1)
9534 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009535 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009536 /* If the string is already ASCII, just return the same string */
9537 Py_INCREF(unicode);
9538 return unicode;
9539 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009540
9541 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9542 PyObject *result = PyUnicode_New(len, 127);
9543 if (result == NULL) {
9544 return NULL;
9545 }
9546
9547 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9548 int kind = PyUnicode_KIND(unicode);
9549 const void *data = PyUnicode_DATA(unicode);
9550 Py_ssize_t i;
9551 for (i = 0; i < len; ++i) {
9552 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9553 if (ch < 127) {
9554 out[i] = ch;
9555 }
9556 else if (Py_UNICODE_ISSPACE(ch)) {
9557 out[i] = ' ';
9558 }
9559 else {
9560 int decimal = Py_UNICODE_TODECIMAL(ch);
9561 if (decimal < 0) {
9562 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009563 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009564 _PyUnicode_LENGTH(result) = i + 1;
9565 break;
9566 }
9567 out[i] = '0' + decimal;
9568 }
9569 }
9570
INADA Naoki16dfca42018-07-14 12:06:43 +09009571 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009572 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009573}
9574
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009575PyObject *
9576PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9577 Py_ssize_t length)
9578{
Victor Stinnerf0124502011-11-21 23:12:56 +01009579 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009580 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009581 Py_UCS4 maxchar;
9582 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009583 const void *data;
Victor Stinnerf0124502011-11-21 23:12:56 +01009584
Victor Stinner99d7ad02012-02-22 13:37:39 +01009585 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009586 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009587 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009588 if (ch > 127) {
9589 int decimal = Py_UNICODE_TODECIMAL(ch);
9590 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009591 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009592 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009593 }
9594 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009595
9596 /* Copy to a new string */
9597 decimal = PyUnicode_New(length, maxchar);
9598 if (decimal == NULL)
9599 return decimal;
9600 kind = PyUnicode_KIND(decimal);
9601 data = PyUnicode_DATA(decimal);
9602 /* Iterate over code points */
9603 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009604 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009605 if (ch > 127) {
9606 int decimal = Py_UNICODE_TODECIMAL(ch);
9607 if (decimal >= 0)
9608 ch = '0' + decimal;
9609 }
9610 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009612 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009613}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009614/* --- Decimal Encoder ---------------------------------------------------- */
9615
Alexander Belopolsky40018472011-02-26 01:02:56 +00009616int
9617PyUnicode_EncodeDecimal(Py_UNICODE *s,
9618 Py_ssize_t length,
9619 char *output,
9620 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009621{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009622 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009623 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009624 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009625 const void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009626
9627 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009628 PyErr_BadArgument();
9629 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009630 }
9631
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009632 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009633 if (unicode == NULL)
9634 return -1;
9635
Victor Stinner42bf7752011-11-21 22:52:58 +01009636 kind = PyUnicode_KIND(unicode);
9637 data = PyUnicode_DATA(unicode);
9638
Victor Stinnerb84d7232011-11-22 01:50:07 +01009639 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009640 PyObject *exc;
9641 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009642 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009643 Py_ssize_t startpos;
9644
9645 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009646
Benjamin Peterson29060642009-01-31 22:14:21 +00009647 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009648 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009649 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009650 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009651 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009652 decimal = Py_UNICODE_TODECIMAL(ch);
9653 if (decimal >= 0) {
9654 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009655 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009656 continue;
9657 }
9658 if (0 < ch && ch < 256) {
9659 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009660 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009661 continue;
9662 }
Victor Stinner6345be92011-11-25 20:09:01 +01009663
Victor Stinner42bf7752011-11-21 22:52:58 +01009664 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009665 exc = NULL;
9666 raise_encode_exception(&exc, "decimal", unicode,
9667 startpos, startpos+1,
9668 "invalid decimal Unicode string");
9669 Py_XDECREF(exc);
9670 Py_DECREF(unicode);
9671 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009672 }
9673 /* 0-terminate the output string */
9674 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009675 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009676 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009677}
9678
Guido van Rossumd57fd912000-03-10 22:53:23 +00009679/* --- Helpers ------------------------------------------------------------ */
9680
/* Helper macro to fix up start/end slice values against a length.
 *
 * `start` and `end` must be modifiable lvalues; they are adjusted in place:
 *   - an `end` greater than `len` is clipped to `len`;
 *   - a negative `end` or `start` is taken relative to `len` and then
 *     clamped at 0.
 * `start` is not clipped from above.  Arguments are evaluated more than
 * once, so they must not have side effects.
 *
 * The body is wrapped in do { ... } while (0) so that the macro expands to
 * exactly one statement and stays correct inside an unbraced if/else.
 */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9695
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009696static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009697any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009699 Py_ssize_t end,
9700 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009702 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009703 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009704 Py_ssize_t len1, len2, result;
9705
9706 kind1 = PyUnicode_KIND(s1);
9707 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009708 if (kind1 < kind2)
9709 return -1;
9710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711 len1 = PyUnicode_GET_LENGTH(s1);
9712 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009713 ADJUST_INDICES(start, end, len1);
9714 if (end - start < len2)
9715 return -1;
9716
9717 buf1 = PyUnicode_DATA(s1);
9718 buf2 = PyUnicode_DATA(s2);
9719 if (len2 == 1) {
9720 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9721 result = findchar((const char *)buf1 + kind1*start,
9722 kind1, end - start, ch, direction);
9723 if (result == -1)
9724 return -1;
9725 else
9726 return start + result;
9727 }
9728
9729 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009730 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009731 if (!buf2)
9732 return -2;
9733 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009734
Victor Stinner794d5672011-10-10 03:21:36 +02009735 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009736 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009737 case PyUnicode_1BYTE_KIND:
9738 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9739 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9740 else
9741 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9742 break;
9743 case PyUnicode_2BYTE_KIND:
9744 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9745 break;
9746 case PyUnicode_4BYTE_KIND:
9747 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9748 break;
9749 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009750 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009751 }
9752 }
9753 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009754 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009755 case PyUnicode_1BYTE_KIND:
9756 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9757 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9758 else
9759 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9760 break;
9761 case PyUnicode_2BYTE_KIND:
9762 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9763 break;
9764 case PyUnicode_4BYTE_KIND:
9765 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9766 break;
9767 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009768 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009769 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009770 }
9771
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009772 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(s2)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009773 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009774 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009775
9776 return result;
9777}
9778
Victor Stinner59423e32018-11-26 13:40:01 +01009779/* _PyUnicode_InsertThousandsGrouping() helper functions */
9780#include "stringlib/localeutil.h"
9781
9782/**
9783 * InsertThousandsGrouping:
9784 * @writer: Unicode writer.
9785 * @n_buffer: Number of characters in @buffer.
9786 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9787 * @d_pos: Start of digits string.
9788 * @n_digits: The number of digits in the string, in which we want
9789 * to put the grouping chars.
9790 * @min_width: The minimum width of the digits in the output string.
9791 * Output will be zero-padded on the left to fill.
9792 * @grouping: see definition in localeconv().
9793 * @thousands_sep: see definition in localeconv().
9794 *
9795 * There are 2 modes: counting and filling. If @writer is NULL,
9796 * we are in counting mode, else filling mode.
9797 * If counting, the required buffer size is returned.
9798 * If filling, we know the buffer will be large enough, so we don't
9799 * need to pass in the buffer size.
9800 * Inserts thousand grouping characters (as defined by grouping and
9801 * thousands_sep) into @writer.
9802 *
9803 * Return value: -1 on error, number of characters otherwise.
9804 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009805Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009806_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009807 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009808 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009809 PyObject *digits,
9810 Py_ssize_t d_pos,
9811 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009812 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009813 const char *grouping,
9814 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009815 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009816{
Xtreak3f7983a2019-01-07 20:39:14 +05309817 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009818 if (writer) {
9819 assert(digits != NULL);
9820 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009821 }
9822 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009823 assert(digits == NULL);
9824 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009825 }
Victor Stinner59423e32018-11-26 13:40:01 +01009826 assert(0 <= d_pos);
9827 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009828 assert(grouping != NULL);
9829
9830 if (digits != NULL) {
9831 if (PyUnicode_READY(digits) == -1) {
9832 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009833 }
Victor Stinner59423e32018-11-26 13:40:01 +01009834 }
9835 if (PyUnicode_READY(thousands_sep) == -1) {
9836 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009837 }
9838
Victor Stinner59423e32018-11-26 13:40:01 +01009839 Py_ssize_t count = 0;
9840 Py_ssize_t n_zeros;
9841 int loop_broken = 0;
9842 int use_separator = 0; /* First time through, don't append the
9843 separator. They only go between
9844 groups. */
9845 Py_ssize_t buffer_pos;
9846 Py_ssize_t digits_pos;
9847 Py_ssize_t len;
9848 Py_ssize_t n_chars;
9849 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9850 be looked at */
9851 /* A generator that returns all of the grouping widths, until it
9852 returns 0. */
9853 GroupGenerator groupgen;
9854 GroupGenerator_init(&groupgen, grouping);
9855 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9856
9857 /* if digits are not grouped, thousands separator
9858 should be an empty string */
9859 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9860
9861 digits_pos = d_pos + n_digits;
9862 if (writer) {
9863 buffer_pos = writer->pos + n_buffer;
9864 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9865 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009866 }
Victor Stinner59423e32018-11-26 13:40:01 +01009867 else {
9868 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009869 }
Victor Stinner59423e32018-11-26 13:40:01 +01009870
9871 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009872 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009873 }
Victor Stinner59423e32018-11-26 13:40:01 +01009874
9875 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9876 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9877 n_zeros = Py_MAX(0, len - remaining);
9878 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9879
9880 /* Use n_zero zero's and n_chars chars */
9881
9882 /* Count only, don't do anything. */
9883 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9884
9885 /* Copy into the writer. */
9886 InsertThousandsGrouping_fill(writer, &buffer_pos,
9887 digits, &digits_pos,
9888 n_chars, n_zeros,
9889 use_separator ? thousands_sep : NULL,
9890 thousands_sep_len, maxchar);
9891
9892 /* Use a separator next time. */
9893 use_separator = 1;
9894
9895 remaining -= n_chars;
9896 min_width -= len;
9897
9898 if (remaining <= 0 && min_width <= 0) {
9899 loop_broken = 1;
9900 break;
9901 }
9902 min_width -= thousands_sep_len;
9903 }
9904 if (!loop_broken) {
9905 /* We left the loop without using a break statement. */
9906
9907 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9908 n_zeros = Py_MAX(0, len - remaining);
9909 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9910
9911 /* Use n_zero zero's and n_chars chars */
9912 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9913
9914 /* Copy into the writer. */
9915 InsertThousandsGrouping_fill(writer, &buffer_pos,
9916 digits, &digits_pos,
9917 n_chars, n_zeros,
9918 use_separator ? thousands_sep : NULL,
9919 thousands_sep_len, maxchar);
9920 }
9921 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009922}
9923
9924
Alexander Belopolsky40018472011-02-26 01:02:56 +00009925Py_ssize_t
9926PyUnicode_Count(PyObject *str,
9927 PyObject *substr,
9928 Py_ssize_t start,
9929 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009930{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009931 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009932 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009933 const void *buf1 = NULL, *buf2 = NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009934 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009935
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009936 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009937 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009938
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009939 kind1 = PyUnicode_KIND(str);
9940 kind2 = PyUnicode_KIND(substr);
9941 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009942 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009943
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009944 len1 = PyUnicode_GET_LENGTH(str);
9945 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009946 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009947 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009948 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009949
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009950 buf1 = PyUnicode_DATA(str);
9951 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009952 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009953 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009954 if (!buf2)
9955 goto onError;
9956 }
9957
9958 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009959 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009960 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009961 result = asciilib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009962 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009963 buf2, len2, PY_SSIZE_T_MAX
9964 );
9965 else
9966 result = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009967 ((const Py_UCS1*)buf1) + start, end - start,
Victor Stinnerc3cec782011-10-05 21:24:08 +02009968 buf2, len2, PY_SSIZE_T_MAX
9969 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009970 break;
9971 case PyUnicode_2BYTE_KIND:
9972 result = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009973 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009974 buf2, len2, PY_SSIZE_T_MAX
9975 );
9976 break;
9977 case PyUnicode_4BYTE_KIND:
9978 result = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009979 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009980 buf2, len2, PY_SSIZE_T_MAX
9981 );
9982 break;
9983 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009984 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009985 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009986
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009987 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009988 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009989 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009990
Guido van Rossumd57fd912000-03-10 22:53:23 +00009991 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009992 onError:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +03009993 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
9994 if (kind2 != kind1)
9995 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009996 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009997}
9998
Alexander Belopolsky40018472011-02-26 01:02:56 +00009999Py_ssize_t
10000PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010001 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010002 Py_ssize_t start,
10003 Py_ssize_t end,
10004 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010005{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010006 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010007 return -2;
Tim Petersced69f82003-09-16 20:30:58 +000010008
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010009 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010010}
10011
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010012Py_ssize_t
10013PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
10014 Py_ssize_t start, Py_ssize_t end,
10015 int direction)
10016{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +080010018 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010019 if (PyUnicode_READY(str) == -1)
10020 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +080010021 len = PyUnicode_GET_LENGTH(str);
10022 ADJUST_INDICES(start, end, len);
10023 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010024 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010026 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
10027 kind, end-start, ch, direction);
10028 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010029 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +020010030 else
10031 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010032}
10033
Alexander Belopolsky40018472011-02-26 01:02:56 +000010034static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010035tailmatch(PyObject *self,
10036 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010037 Py_ssize_t start,
10038 Py_ssize_t end,
10039 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010040{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010041 int kind_self;
10042 int kind_sub;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010043 const void *data_self;
10044 const void *data_sub;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010045 Py_ssize_t offset;
10046 Py_ssize_t i;
10047 Py_ssize_t end_sub;
10048
10049 if (PyUnicode_READY(self) == -1 ||
10050 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +010010051 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010053 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
10054 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010055 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +000010056 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010057
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +030010058 if (PyUnicode_GET_LENGTH(substring) == 0)
10059 return 1;
10060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010061 kind_self = PyUnicode_KIND(self);
10062 data_self = PyUnicode_DATA(self);
10063 kind_sub = PyUnicode_KIND(substring);
10064 data_sub = PyUnicode_DATA(substring);
10065 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
10066
10067 if (direction > 0)
10068 offset = end;
10069 else
10070 offset = start;
10071
10072 if (PyUnicode_READ(kind_self, data_self, offset) ==
10073 PyUnicode_READ(kind_sub, data_sub, 0) &&
10074 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
10075 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
10076 /* If both are of the same kind, memcmp is sufficient */
10077 if (kind_self == kind_sub) {
10078 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010079 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010080 data_sub,
10081 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010082 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010083 }
Martin Pantere26da7c2016-06-02 10:07:09 +000010084 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010085 else {
10086 /* We do not need to compare 0 and len(substring)-1 because
10087 the if statement above ensured already that they are equal
10088 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010089 for (i = 1; i < end_sub; ++i) {
10090 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
10091 PyUnicode_READ(kind_sub, data_sub, i))
10092 return 0;
10093 }
Benjamin Peterson29060642009-01-31 22:14:21 +000010094 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010096 }
10097
10098 return 0;
10099}
10100
Alexander Belopolsky40018472011-02-26 01:02:56 +000010101Py_ssize_t
10102PyUnicode_Tailmatch(PyObject *str,
10103 PyObject *substr,
10104 Py_ssize_t start,
10105 Py_ssize_t end,
10106 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010107{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010108 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010109 return -1;
Tim Petersced69f82003-09-16 20:30:58 +000010110
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010111 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010112}
10113
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010114static PyObject *
10115ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010116{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010117 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010118 const char *data = PyUnicode_DATA(self);
10119 char *resdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010120 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +000010121
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010122 res = PyUnicode_New(len, 127);
10123 if (res == NULL)
10124 return NULL;
10125 resdata = PyUnicode_DATA(res);
10126 if (lower)
10127 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010129 _Py_bytes_upper(resdata, data, len);
10130 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010131}
10132
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010133static Py_UCS4
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010134handle_capital_sigma(int kind, const void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010135{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010136 Py_ssize_t j;
10137 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010010138 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010139 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +000010140
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010141 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
10142
10143 where ! is a negation and \p{xxx} is a character with property xxx.
10144 */
10145 for (j = i - 1; j >= 0; j--) {
10146 c = PyUnicode_READ(kind, data, j);
10147 if (!_PyUnicode_IsCaseIgnorable(c))
10148 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010149 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010150 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
10151 if (final_sigma) {
10152 for (j = i + 1; j < length; j++) {
10153 c = PyUnicode_READ(kind, data, j);
10154 if (!_PyUnicode_IsCaseIgnorable(c))
10155 break;
10156 }
10157 final_sigma = j == length || !_PyUnicode_IsCased(c);
10158 }
10159 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010160}
10161
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010162static int
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010163lower_ucs4(int kind, const void *data, Py_ssize_t length, Py_ssize_t i,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010164 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010166 /* Obscure special case. */
10167 if (c == 0x3A3) {
10168 mapped[0] = handle_capital_sigma(kind, data, length, i);
10169 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010171 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172}
10173
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010174static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010175do_capitalize(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010176{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010177 Py_ssize_t i, k = 0;
10178 int n_res, j;
10179 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +000010180
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010181 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +010010182 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010183 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010184 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010185 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010187 for (i = 1; i < length; i++) {
10188 c = PyUnicode_READ(kind, data, i);
10189 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10190 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010191 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010192 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010193 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +000010194 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010195 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010196}
10197
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010198static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010199do_swapcase(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010200 Py_ssize_t i, k = 0;
10201
10202 for (i = 0; i < length; i++) {
10203 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10204 int n_res, j;
10205 if (Py_UNICODE_ISUPPER(c)) {
10206 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10207 }
10208 else if (Py_UNICODE_ISLOWER(c)) {
10209 n_res = _PyUnicode_ToUpperFull(c, mapped);
10210 }
10211 else {
10212 n_res = 1;
10213 mapped[0] = c;
10214 }
10215 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010216 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010217 res[k++] = mapped[j];
10218 }
10219 }
10220 return k;
10221}
10222
10223static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010224do_upper_or_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res,
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010225 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010226{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010227 Py_ssize_t i, k = 0;
10228
10229 for (i = 0; i < length; i++) {
10230 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
10231 int n_res, j;
10232 if (lower)
10233 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10234 else
10235 n_res = _PyUnicode_ToUpperFull(c, mapped);
10236 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010237 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010238 res[k++] = mapped[j];
10239 }
10240 }
10241 return k;
10242}
10243
10244static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010245do_upper(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010246{
10247 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
10248}
10249
10250static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010251do_lower(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010252{
10253 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
10254}
10255
Benjamin Petersone51757f2012-01-12 21:10:29 -050010256static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010257do_casefold(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersond5890c82012-01-14 13:23:30 -050010258{
10259 Py_ssize_t i, k = 0;
10260
10261 for (i = 0; i < length; i++) {
10262 Py_UCS4 c = PyUnicode_READ(kind, data, i);
10263 Py_UCS4 mapped[3];
10264 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
10265 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010266 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010267 res[k++] = mapped[j];
10268 }
10269 }
10270 return k;
10271}
10272
10273static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010274do_title(int kind, const void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Benjamin Petersone51757f2012-01-12 21:10:29 -050010275{
10276 Py_ssize_t i, k = 0;
10277 int previous_is_cased;
10278
10279 previous_is_cased = 0;
10280 for (i = 0; i < length; i++) {
10281 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
10282 Py_UCS4 mapped[3];
10283 int n_res, j;
10284
10285 if (previous_is_cased)
10286 n_res = lower_ucs4(kind, data, length, i, c, mapped);
10287 else
10288 n_res = _PyUnicode_ToTitleFull(c, mapped);
10289
10290 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -070010291 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -050010292 res[k++] = mapped[j];
10293 }
10294
10295 previous_is_cased = _PyUnicode_IsCased(c);
10296 }
10297 return k;
10298}
10299
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010300static PyObject *
10301case_operation(PyObject *self,
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010302 Py_ssize_t (*perform)(int, const void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010303{
10304 PyObject *res = NULL;
10305 Py_ssize_t length, newlength = 0;
10306 int kind, outkind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010307 const void *data;
10308 void *outdata;
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010309 Py_UCS4 maxchar = 0, *tmp, *tmpend;
10310
Benjamin Petersoneea48462012-01-16 14:28:50 -050010311 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010312
10313 kind = PyUnicode_KIND(self);
10314 data = PyUnicode_DATA(self);
10315 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010316 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010317 PyErr_SetString(PyExc_OverflowError, "string is too long");
10318 return NULL;
10319 }
Victor Stinner00d7abd2020-12-01 09:56:42 +010010320 tmp = PyMem_Malloc(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010321 if (tmp == NULL)
10322 return PyErr_NoMemory();
10323 newlength = perform(kind, data, length, tmp, &maxchar);
10324 res = PyUnicode_New(newlength, maxchar);
10325 if (res == NULL)
10326 goto leave;
10327 tmpend = tmp + newlength;
10328 outdata = PyUnicode_DATA(res);
10329 outkind = PyUnicode_KIND(res);
10330 switch (outkind) {
10331 case PyUnicode_1BYTE_KIND:
10332 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10333 break;
10334 case PyUnicode_2BYTE_KIND:
10335 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10336 break;
10337 case PyUnicode_4BYTE_KIND:
10338 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10339 break;
10340 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010341 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010342 }
10343 leave:
Victor Stinner00d7abd2020-12-01 09:56:42 +010010344 PyMem_Free(tmp);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010345 return res;
10346}
10347
Tim Peters8ce9f162004-08-27 01:49:32 +000010348PyObject *
10349PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010350{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010351 PyObject *res;
10352 PyObject *fseq;
10353 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010354 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010355
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010356 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010357 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010358 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010359 }
10360
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010361 /* NOTE: the following code can't call back into Python code,
10362 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010363 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010364
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010365 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010366 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010367 res = _PyUnicode_JoinArray(separator, items, seqlen);
10368 Py_DECREF(fseq);
10369 return res;
10370}
10371
10372PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010373_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010374{
10375 PyObject *res = NULL; /* the result */
10376 PyObject *sep = NULL;
10377 Py_ssize_t seplen;
10378 PyObject *item;
10379 Py_ssize_t sz, i, res_offset;
10380 Py_UCS4 maxchar;
10381 Py_UCS4 item_maxchar;
10382 int use_memcpy;
10383 unsigned char *res_data = NULL, *sep_data = NULL;
10384 PyObject *last_obj;
10385 unsigned int kind = 0;
10386
Tim Peters05eba1f2004-08-27 21:32:02 +000010387 /* If empty sequence, return u"". */
10388 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010389 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010390 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010391
Tim Peters05eba1f2004-08-27 21:32:02 +000010392 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010393 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010394 if (seqlen == 1) {
10395 if (PyUnicode_CheckExact(items[0])) {
10396 res = items[0];
10397 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010398 return res;
10399 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010400 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010401 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010402 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010403 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010404 /* Set up sep and seplen */
10405 if (separator == NULL) {
10406 /* fall back to a blank space separator */
10407 sep = PyUnicode_FromOrdinal(' ');
10408 if (!sep)
10409 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010410 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010411 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010412 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010413 else {
10414 if (!PyUnicode_Check(separator)) {
10415 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010416 "separator: expected str instance,"
10417 " %.80s found",
10418 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010419 goto onError;
10420 }
10421 if (PyUnicode_READY(separator))
10422 goto onError;
10423 sep = separator;
10424 seplen = PyUnicode_GET_LENGTH(separator);
10425 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10426 /* inc refcount to keep this code path symmetric with the
10427 above case of a blank separator */
10428 Py_INCREF(sep);
10429 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010430 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010431 }
10432
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010433 /* There are at least two things to join, or else we have a subclass
10434 * of str in the sequence.
10435 * Do a pre-pass to figure out the total amount of space we'll
10436 * need (sz), and see whether all argument are strings.
10437 */
10438 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010439#ifdef Py_DEBUG
10440 use_memcpy = 0;
10441#else
10442 use_memcpy = 1;
10443#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010444 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010445 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010446 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010447 if (!PyUnicode_Check(item)) {
10448 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010449 "sequence item %zd: expected str instance,"
10450 " %.80s found",
10451 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010452 goto onError;
10453 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 if (PyUnicode_READY(item) == -1)
10455 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010456 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010458 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010459 if (i != 0) {
10460 add_sz += seplen;
10461 }
10462 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010463 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010464 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010465 goto onError;
10466 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010467 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010468 if (use_memcpy && last_obj != NULL) {
10469 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10470 use_memcpy = 0;
10471 }
10472 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010473 }
Tim Petersced69f82003-09-16 20:30:58 +000010474
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010476 if (res == NULL)
10477 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010478
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010479 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010480#ifdef Py_DEBUG
10481 use_memcpy = 0;
10482#else
10483 if (use_memcpy) {
10484 res_data = PyUnicode_1BYTE_DATA(res);
10485 kind = PyUnicode_KIND(res);
10486 if (seplen != 0)
10487 sep_data = PyUnicode_1BYTE_DATA(sep);
10488 }
10489#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010490 if (use_memcpy) {
10491 for (i = 0; i < seqlen; ++i) {
10492 Py_ssize_t itemlen;
10493 item = items[i];
10494
10495 /* Copy item, and maybe the separator. */
10496 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010497 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010498 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010499 kind * seplen);
10500 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010501 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010502
10503 itemlen = PyUnicode_GET_LENGTH(item);
10504 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010505 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010506 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010507 kind * itemlen);
10508 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010509 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010510 }
10511 assert(res_data == PyUnicode_1BYTE_DATA(res)
10512 + kind * PyUnicode_GET_LENGTH(res));
10513 }
10514 else {
10515 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10516 Py_ssize_t itemlen;
10517 item = items[i];
10518
10519 /* Copy item, and maybe the separator. */
10520 if (i && seplen != 0) {
10521 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10522 res_offset += seplen;
10523 }
10524
10525 itemlen = PyUnicode_GET_LENGTH(item);
10526 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010527 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010528 res_offset += itemlen;
10529 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010530 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010531 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010532 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010533
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010535 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010537
Benjamin Peterson29060642009-01-31 22:14:21 +000010538 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010539 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010540 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010541 return NULL;
10542}
10543
Victor Stinnerd3f08822012-05-29 12:57:52 +020010544void
10545_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10546 Py_UCS4 fill_char)
10547{
10548 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010549 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010550 assert(PyUnicode_IS_READY(unicode));
10551 assert(unicode_modifiable(unicode));
10552 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10553 assert(start >= 0);
10554 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010555 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010556}
10557
Victor Stinner3fe55312012-01-04 00:33:50 +010010558Py_ssize_t
10559PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10560 Py_UCS4 fill_char)
10561{
10562 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010563
10564 if (!PyUnicode_Check(unicode)) {
10565 PyErr_BadInternalCall();
10566 return -1;
10567 }
10568 if (PyUnicode_READY(unicode) == -1)
10569 return -1;
10570 if (unicode_check_modifiable(unicode))
10571 return -1;
10572
Victor Stinnerd3f08822012-05-29 12:57:52 +020010573 if (start < 0) {
10574 PyErr_SetString(PyExc_IndexError, "string index out of range");
10575 return -1;
10576 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010577 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10578 PyErr_SetString(PyExc_ValueError,
10579 "fill character is bigger than "
10580 "the string maximum character");
10581 return -1;
10582 }
10583
10584 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10585 length = Py_MIN(maxlen, length);
10586 if (length <= 0)
10587 return 0;
10588
Victor Stinnerd3f08822012-05-29 12:57:52 +020010589 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010590 return length;
10591}
10592
Victor Stinner9310abb2011-10-05 00:59:23 +020010593static PyObject *
10594pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010595 Py_ssize_t left,
10596 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010598{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 PyObject *u;
10600 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010601 int kind;
10602 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603
10604 if (left < 0)
10605 left = 0;
10606 if (right < 0)
10607 right = 0;
10608
Victor Stinnerc4b49542011-12-11 22:44:26 +010010609 if (left == 0 && right == 0)
10610 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10613 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010614 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10615 return NULL;
10616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010618 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010620 if (!u)
10621 return NULL;
10622
10623 kind = PyUnicode_KIND(u);
10624 data = PyUnicode_DATA(u);
10625 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010626 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010627 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010628 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010629 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010630 assert(_PyUnicode_CheckConsistency(u, 1));
10631 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010632}
10633
Alexander Belopolsky40018472011-02-26 01:02:56 +000010634PyObject *
10635PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010636{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010637 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010638
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010639 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010640 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010641
Benjamin Petersonead6b532011-12-20 17:23:42 -060010642 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010643 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010644 if (PyUnicode_IS_ASCII(string))
10645 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010646 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010647 PyUnicode_GET_LENGTH(string), keepends);
10648 else
10649 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010650 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010651 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 break;
10653 case PyUnicode_2BYTE_KIND:
10654 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010655 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 PyUnicode_GET_LENGTH(string), keepends);
10657 break;
10658 case PyUnicode_4BYTE_KIND:
10659 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010660 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010661 PyUnicode_GET_LENGTH(string), keepends);
10662 break;
10663 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010664 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010666 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010667}
10668
Alexander Belopolsky40018472011-02-26 01:02:56 +000010669static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010670split(PyObject *self,
10671 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010672 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010673{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010674 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010675 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 Py_ssize_t len1, len2;
10677 PyObject* out;
10678
Guido van Rossumd57fd912000-03-10 22:53:23 +000010679 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010680 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010681
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 if (PyUnicode_READY(self) == -1)
10683 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010684
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010686 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010688 if (PyUnicode_IS_ASCII(self))
10689 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010690 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010691 PyUnicode_GET_LENGTH(self), maxcount
10692 );
10693 else
10694 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010695 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010696 PyUnicode_GET_LENGTH(self), maxcount
10697 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 case PyUnicode_2BYTE_KIND:
10699 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010700 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 PyUnicode_GET_LENGTH(self), maxcount
10702 );
10703 case PyUnicode_4BYTE_KIND:
10704 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010705 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010706 PyUnicode_GET_LENGTH(self), maxcount
10707 );
10708 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010709 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010710 }
10711
10712 if (PyUnicode_READY(substring) == -1)
10713 return NULL;
10714
10715 kind1 = PyUnicode_KIND(self);
10716 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010717 len1 = PyUnicode_GET_LENGTH(self);
10718 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010719 if (kind1 < kind2 || len1 < len2) {
10720 out = PyList_New(1);
10721 if (out == NULL)
10722 return NULL;
10723 Py_INCREF(self);
10724 PyList_SET_ITEM(out, 0, self);
10725 return out;
10726 }
10727 buf1 = PyUnicode_DATA(self);
10728 buf2 = PyUnicode_DATA(substring);
10729 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010730 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010731 if (!buf2)
10732 return NULL;
10733 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010734
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010735 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010736 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010737 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10738 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010739 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010740 else
10741 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010742 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 break;
10744 case PyUnicode_2BYTE_KIND:
10745 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010746 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010747 break;
10748 case PyUnicode_4BYTE_KIND:
10749 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010750 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010751 break;
10752 default:
10753 out = NULL;
10754 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010755 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010756 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010757 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010758 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759}
10760
Alexander Belopolsky40018472011-02-26 01:02:56 +000010761static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010762rsplit(PyObject *self,
10763 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010764 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010765{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010766 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010767 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010768 Py_ssize_t len1, len2;
10769 PyObject* out;
10770
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010771 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010772 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010774 if (PyUnicode_READY(self) == -1)
10775 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010778 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010780 if (PyUnicode_IS_ASCII(self))
10781 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010782 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010783 PyUnicode_GET_LENGTH(self), maxcount
10784 );
10785 else
10786 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010787 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010788 PyUnicode_GET_LENGTH(self), maxcount
10789 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010790 case PyUnicode_2BYTE_KIND:
10791 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010792 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010793 PyUnicode_GET_LENGTH(self), maxcount
10794 );
10795 case PyUnicode_4BYTE_KIND:
10796 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010797 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 PyUnicode_GET_LENGTH(self), maxcount
10799 );
10800 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010801 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 }
10803
10804 if (PyUnicode_READY(substring) == -1)
10805 return NULL;
10806
10807 kind1 = PyUnicode_KIND(self);
10808 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010809 len1 = PyUnicode_GET_LENGTH(self);
10810 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010811 if (kind1 < kind2 || len1 < len2) {
10812 out = PyList_New(1);
10813 if (out == NULL)
10814 return NULL;
10815 Py_INCREF(self);
10816 PyList_SET_ITEM(out, 0, self);
10817 return out;
10818 }
10819 buf1 = PyUnicode_DATA(self);
10820 buf2 = PyUnicode_DATA(substring);
10821 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010822 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010823 if (!buf2)
10824 return NULL;
10825 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010826
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010827 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010828 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010829 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10830 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010831 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010832 else
10833 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010834 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010835 break;
10836 case PyUnicode_2BYTE_KIND:
10837 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010838 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010839 break;
10840 case PyUnicode_4BYTE_KIND:
10841 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010842 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010843 break;
10844 default:
10845 out = NULL;
10846 }
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010847 assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010848 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010849 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010850 return out;
10851}
10852
10853static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010854anylib_find(int kind, PyObject *str1, const void *buf1, Py_ssize_t len1,
10855 PyObject *str2, const void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010856{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010857 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010858 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010859 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10860 return asciilib_find(buf1, len1, buf2, len2, offset);
10861 else
10862 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863 case PyUnicode_2BYTE_KIND:
10864 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10865 case PyUnicode_4BYTE_KIND:
10866 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10867 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010868 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869}
10870
10871static Py_ssize_t
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010872anylib_count(int kind, PyObject *sstr, const void* sbuf, Py_ssize_t slen,
10873 PyObject *str1, const void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010874{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010875 switch (kind) {
10876 case PyUnicode_1BYTE_KIND:
10877 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10878 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10879 else
10880 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10881 case PyUnicode_2BYTE_KIND:
10882 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10883 case PyUnicode_4BYTE_KIND:
10884 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10885 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010886 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010887}
10888
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010889static void
10890replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10891 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10892{
10893 int kind = PyUnicode_KIND(u);
10894 void *data = PyUnicode_DATA(u);
10895 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10896 if (kind == PyUnicode_1BYTE_KIND) {
10897 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10898 (Py_UCS1 *)data + len,
10899 u1, u2, maxcount);
10900 }
10901 else if (kind == PyUnicode_2BYTE_KIND) {
10902 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10903 (Py_UCS2 *)data + len,
10904 u1, u2, maxcount);
10905 }
10906 else {
10907 assert(kind == PyUnicode_4BYTE_KIND);
10908 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10909 (Py_UCS4 *)data + len,
10910 u1, u2, maxcount);
10911 }
10912}
10913
Alexander Belopolsky40018472011-02-26 01:02:56 +000010914static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010915replace(PyObject *self, PyObject *str1,
10916 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010917{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010918 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010919 const char *sbuf = PyUnicode_DATA(self);
10920 const void *buf1 = PyUnicode_DATA(str1);
10921 const void *buf2 = PyUnicode_DATA(str2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010922 int srelease = 0, release1 = 0, release2 = 0;
10923 int skind = PyUnicode_KIND(self);
10924 int kind1 = PyUnicode_KIND(str1);
10925 int kind2 = PyUnicode_KIND(str2);
10926 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10927 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10928 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010929 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010930 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010932 if (slen < len1)
10933 goto nothing;
10934
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010936 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010937 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010938 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010939
Victor Stinner59de0ee2011-10-07 10:01:28 +020010940 if (str1 == str2)
10941 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942
Victor Stinner49a0a212011-10-12 23:46:10 +020010943 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010944 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10945 if (maxchar < maxchar_str1)
10946 /* substring too wide to be present */
10947 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010948 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10949 /* Replacing str1 with str2 may cause a maxchar reduction in the
10950 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010951 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010952 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010954 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010955 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010956 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010957 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010959 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010960 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010961 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010962
Victor Stinner69ed0f42013-04-09 21:48:24 +020010963 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010964 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010965 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010966 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010967 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010968 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010969 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010970 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010971
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010972 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10973 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010974 }
10975 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010976 int rkind = skind;
10977 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010978 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010979
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010980 if (kind1 < rkind) {
10981 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010982 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010983 if (!buf1) goto error;
10984 release1 = 1;
10985 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010986 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010987 if (i < 0)
10988 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010989 if (rkind > kind2) {
10990 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010991 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 if (!buf2) goto error;
10993 release2 = 1;
10994 }
10995 else if (rkind < kind2) {
10996 /* widen self and buf1 */
10997 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010998 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030010999 assert(buf1 != PyUnicode_DATA(str1));
11000 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011001 buf1 = PyUnicode_DATA(str1);
11002 release1 = 0;
11003 }
11004 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 if (!sbuf) goto error;
11006 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011007 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 if (!buf1) goto error;
11009 release1 = 1;
11010 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011011 u = PyUnicode_New(slen, maxchar);
11012 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020011014 assert(PyUnicode_KIND(u) == rkind);
11015 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020011016
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011017 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000011018 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011019 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011020 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011021 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011022 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000011023
11024 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020011025 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011026 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020011027 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000011028 if (i == -1)
11029 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011030 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011031 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011032 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011033 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000011034 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011035 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011036 }
11037 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011038 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010011039 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 int rkind = skind;
11041 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011043 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020011044 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011045 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011046 if (!buf1) goto error;
11047 release1 = 1;
11048 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020011049 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011050 if (n == 0)
11051 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011052 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020011053 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011054 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011055 if (!buf2) goto error;
11056 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011057 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011058 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020011059 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011060 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011061 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011062 if (!sbuf) goto error;
11063 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011064 if (release1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011065 assert(buf1 != PyUnicode_DATA(str1));
11066 PyMem_Free((void *)buf1);
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011067 buf1 = PyUnicode_DATA(str1);
11068 release1 = 0;
11069 }
11070 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011071 if (!buf1) goto error;
11072 release1 = 1;
11073 }
11074 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
Łukasz Langa8c1e1da2021-09-22 01:33:59 +020011075 PyUnicode_GET_LENGTH(str1)); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011076 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011077 PyErr_SetString(PyExc_OverflowError,
11078 "replace string is too long");
11079 goto error;
11080 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010011081 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020011082 if (new_size == 0) {
Victor Stinner90ed8a62020-06-24 00:34:07 +020011083 u = unicode_new_empty();
Victor Stinner49a0a212011-10-12 23:46:10 +020011084 goto done;
11085 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080011086 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011087 PyErr_SetString(PyExc_OverflowError,
11088 "replace string is too long");
11089 goto error;
11090 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011091 u = PyUnicode_New(new_size, maxchar);
11092 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011093 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020011094 assert(PyUnicode_KIND(u) == rkind);
11095 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011096 ires = i = 0;
11097 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011098 while (n-- > 0) {
11099 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020011100 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011101 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020011102 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000011103 if (j == -1)
11104 break;
11105 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011106 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011107 memcpy(res + rkind * ires,
11108 sbuf + rkind * i,
11109 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011111 }
11112 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011113 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011114 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011115 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011116 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011117 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011118 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011119 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011121 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011122 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011123 memcpy(res + rkind * ires,
11124 sbuf + rkind * i,
11125 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020011126 }
11127 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000011128 /* interleave */
11129 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011130 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011131 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011132 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011133 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011134 if (--n <= 0)
11135 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011136 memcpy(res + rkind * ires,
11137 sbuf + rkind * i,
11138 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011139 ires++;
11140 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011141 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020011142 memcpy(res + rkind * ires,
11143 sbuf + rkind * i,
11144 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000011145 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011146 }
11147
11148 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020011149 unicode_adjust_maxchar(&u);
11150 if (u == NULL)
11151 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011152 }
Victor Stinner49a0a212011-10-12 23:46:10 +020011153
11154 done:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011155 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11156 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11157 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011158 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011159 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011160 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011161 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011162 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011163 PyMem_Free((void *)buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020011164 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011165 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011166
Benjamin Peterson29060642009-01-31 22:14:21 +000011167 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000011168 /* nothing to replace; return original string (when possible) */
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011169 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11170 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11171 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011172 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011173 PyMem_Free((void *)sbuf);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011174 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011175 PyMem_Free((void *)buf1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011176 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011177 PyMem_Free((void *)buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010011178 return unicode_result_unchanged(self);
11179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011180 error:
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011181 assert(srelease == (sbuf != PyUnicode_DATA(self)));
11182 assert(release1 == (buf1 != PyUnicode_DATA(str1)));
11183 assert(release2 == (buf2 != PyUnicode_DATA(str2)));
11184 if (srelease)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011185 PyMem_Free((void *)sbuf);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011186 if (release1)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011187 PyMem_Free((void *)buf1);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011188 if (release2)
Victor Stinner00d7abd2020-12-01 09:56:42 +010011189 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011190 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011191}
11192
11193/* --- Unicode Object Methods --------------------------------------------- */
11194
INADA Naoki3ae20562017-01-16 20:41:20 +090011195/*[clinic input]
11196str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000011197
INADA Naoki3ae20562017-01-16 20:41:20 +090011198Return a version of the string where each word is titlecased.
11199
11200More specifically, words start with uppercased characters and all remaining
11201cased characters have lower case.
11202[clinic start generated code]*/
11203
11204static PyObject *
11205unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011206/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011207{
Benjamin Petersoneea48462012-01-16 14:28:50 -050011208 if (PyUnicode_READY(self) == -1)
11209 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011210 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011211}
11212
INADA Naoki3ae20562017-01-16 20:41:20 +090011213/*[clinic input]
11214str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000011215
INADA Naoki3ae20562017-01-16 20:41:20 +090011216Return a capitalized version of the string.
11217
11218More specifically, make the first character have upper case and the rest lower
11219case.
11220[clinic start generated code]*/
11221
11222static PyObject *
11223unicode_capitalize_impl(PyObject *self)
11224/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011225{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050011226 if (PyUnicode_READY(self) == -1)
11227 return NULL;
11228 if (PyUnicode_GET_LENGTH(self) == 0)
11229 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011230 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011231}
11232
INADA Naoki3ae20562017-01-16 20:41:20 +090011233/*[clinic input]
11234str.casefold as unicode_casefold
11235
11236Return a version of the string suitable for caseless comparisons.
11237[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011238
11239static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011240unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011241/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050011242{
11243 if (PyUnicode_READY(self) == -1)
11244 return NULL;
11245 if (PyUnicode_IS_ASCII(self))
11246 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010011247 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050011248}
11249
11250
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011251/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011252
11253static int
11254convert_uc(PyObject *obj, void *addr)
11255{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011257
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011258 if (!PyUnicode_Check(obj)) {
11259 PyErr_Format(PyExc_TypeError,
11260 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020011261 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011262 return 0;
11263 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011264 if (PyUnicode_READY(obj) < 0)
11265 return 0;
11266 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000011267 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000011268 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000011269 return 0;
11270 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011271 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000011272 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000011273}
11274
INADA Naoki3ae20562017-01-16 20:41:20 +090011275/*[clinic input]
11276str.center as unicode_center
11277
11278 width: Py_ssize_t
11279 fillchar: Py_UCS4 = ' '
11280 /
11281
11282Return a centered string of length width.
11283
11284Padding is done using the specified fill character (default is a space).
11285[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011286
11287static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011288unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
11289/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011290{
Martin v. Löwis18e16552006-02-15 17:27:45 +000011291 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011292
Benjamin Petersonbac79492012-01-14 13:34:47 -050011293 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011294 return NULL;
11295
Victor Stinnerc4b49542011-12-11 22:44:26 +010011296 if (PyUnicode_GET_LENGTH(self) >= width)
11297 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011298
Victor Stinnerc4b49542011-12-11 22:44:26 +010011299 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011300 left = marg / 2 + (marg & width & 1);
11301
Victor Stinner9310abb2011-10-05 00:59:23 +020011302 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011303}
11304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011305/* This function assumes that str1 and str2 are readied by the caller. */
11306
Marc-André Lemburge5034372000-08-08 08:04:29 +000011307static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011308unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000011309{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011310#define COMPARE(TYPE1, TYPE2) \
11311 do { \
11312 TYPE1* p1 = (TYPE1 *)data1; \
11313 TYPE2* p2 = (TYPE2 *)data2; \
11314 TYPE1* end = p1 + len; \
11315 Py_UCS4 c1, c2; \
11316 for (; p1 != end; p1++, p2++) { \
11317 c1 = *p1; \
11318 c2 = *p2; \
11319 if (c1 != c2) \
11320 return (c1 < c2) ? -1 : 1; \
11321 } \
11322 } \
11323 while (0)
11324
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011325 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011326 const void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011327 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011328
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329 kind1 = PyUnicode_KIND(str1);
11330 kind2 = PyUnicode_KIND(str2);
11331 data1 = PyUnicode_DATA(str1);
11332 data2 = PyUnicode_DATA(str2);
11333 len1 = PyUnicode_GET_LENGTH(str1);
11334 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011335 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011336
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011337 switch(kind1) {
11338 case PyUnicode_1BYTE_KIND:
11339 {
11340 switch(kind2) {
11341 case PyUnicode_1BYTE_KIND:
11342 {
11343 int cmp = memcmp(data1, data2, len);
11344 /* normalize result of memcmp() into the range [-1; 1] */
11345 if (cmp < 0)
11346 return -1;
11347 if (cmp > 0)
11348 return 1;
11349 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011350 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011351 case PyUnicode_2BYTE_KIND:
11352 COMPARE(Py_UCS1, Py_UCS2);
11353 break;
11354 case PyUnicode_4BYTE_KIND:
11355 COMPARE(Py_UCS1, Py_UCS4);
11356 break;
11357 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011358 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011359 }
11360 break;
11361 }
11362 case PyUnicode_2BYTE_KIND:
11363 {
11364 switch(kind2) {
11365 case PyUnicode_1BYTE_KIND:
11366 COMPARE(Py_UCS2, Py_UCS1);
11367 break;
11368 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011369 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011370 COMPARE(Py_UCS2, Py_UCS2);
11371 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011372 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011373 case PyUnicode_4BYTE_KIND:
11374 COMPARE(Py_UCS2, Py_UCS4);
11375 break;
11376 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011377 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011378 }
11379 break;
11380 }
11381 case PyUnicode_4BYTE_KIND:
11382 {
11383 switch(kind2) {
11384 case PyUnicode_1BYTE_KIND:
11385 COMPARE(Py_UCS4, Py_UCS1);
11386 break;
11387 case PyUnicode_2BYTE_KIND:
11388 COMPARE(Py_UCS4, Py_UCS2);
11389 break;
11390 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011391 {
11392#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11393 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11394 /* normalize result of wmemcmp() into the range [-1; 1] */
11395 if (cmp < 0)
11396 return -1;
11397 if (cmp > 0)
11398 return 1;
11399#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011400 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011401#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011402 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011403 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011404 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011405 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011406 }
11407 break;
11408 }
11409 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011410 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011411 }
11412
Victor Stinner770e19e2012-10-04 22:59:45 +020011413 if (len1 == len2)
11414 return 0;
11415 if (len1 < len2)
11416 return -1;
11417 else
11418 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011419
11420#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011421}
11422
Benjamin Peterson621b4302016-09-09 13:54:34 -070011423static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011424unicode_compare_eq(PyObject *str1, PyObject *str2)
11425{
11426 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011427 const void *data1, *data2;
Victor Stinnere5567ad2012-10-23 02:48:49 +020011428 Py_ssize_t len;
11429 int cmp;
11430
Victor Stinnere5567ad2012-10-23 02:48:49 +020011431 len = PyUnicode_GET_LENGTH(str1);
11432 if (PyUnicode_GET_LENGTH(str2) != len)
11433 return 0;
11434 kind = PyUnicode_KIND(str1);
11435 if (PyUnicode_KIND(str2) != kind)
11436 return 0;
11437 data1 = PyUnicode_DATA(str1);
11438 data2 = PyUnicode_DATA(str2);
11439
11440 cmp = memcmp(data1, data2, len * kind);
11441 return (cmp == 0);
11442}
11443
11444
Alexander Belopolsky40018472011-02-26 01:02:56 +000011445int
11446PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011447{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11449 if (PyUnicode_READY(left) == -1 ||
11450 PyUnicode_READY(right) == -1)
11451 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011452
11453 /* a string is equal to itself */
11454 if (left == right)
11455 return 0;
11456
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011457 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011459 PyErr_Format(PyExc_TypeError,
11460 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011461 Py_TYPE(left)->tp_name,
11462 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011463 return -1;
11464}
11465
Martin v. Löwis5b222132007-06-10 09:51:05 +000011466int
11467PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11468{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011469 Py_ssize_t i;
11470 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011471 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011472 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011473
Victor Stinner910337b2011-10-03 03:20:16 +020011474 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011475 if (!PyUnicode_IS_READY(uni)) {
11476 const wchar_t *ws = _PyUnicode_WSTR(uni);
11477 /* Compare Unicode string and source character set string */
11478 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11479 if (chr != ustr[i])
11480 return (chr < ustr[i]) ? -1 : 1;
11481 }
11482 /* This check keeps Python strings that end in '\0' from comparing equal
11483 to C strings identical up to that point. */
11484 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11485 return 1; /* uni is longer */
11486 if (ustr[i])
11487 return -1; /* str is longer */
11488 return 0;
11489 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011490 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011491 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011492 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011493 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011494 size_t len, len2 = strlen(str);
11495 int cmp;
11496
11497 len = Py_MIN(len1, len2);
11498 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011499 if (cmp != 0) {
11500 if (cmp < 0)
11501 return -1;
11502 else
11503 return 1;
11504 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011505 if (len1 > len2)
11506 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011507 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011508 return -1; /* str is longer */
11509 return 0;
11510 }
11511 else {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011512 const void *data = PyUnicode_DATA(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011513 /* Compare Unicode string and source character set string */
11514 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011515 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011516 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11517 /* This check keeps Python strings that end in '\0' from comparing equal
11518 to C strings identical up to that point. */
11519 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11520 return 1; /* uni is longer */
11521 if (str[i])
11522 return -1; /* str is longer */
11523 return 0;
11524 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011525}
11526
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011527static int
11528non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11529{
11530 size_t i, len;
11531 const wchar_t *p;
11532 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11533 if (strlen(str) != len)
11534 return 0;
11535 p = _PyUnicode_WSTR(unicode);
11536 assert(p);
11537 for (i = 0; i < len; i++) {
11538 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011539 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011540 return 0;
11541 }
11542 return 1;
11543}
11544
11545int
11546_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11547{
11548 size_t len;
11549 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011550 assert(str);
11551#ifndef NDEBUG
11552 for (const char *p = str; *p; p++) {
11553 assert((unsigned char)*p < 128);
11554 }
11555#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011556 if (PyUnicode_READY(unicode) == -1) {
11557 /* Memory error or bad data */
11558 PyErr_Clear();
11559 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11560 }
11561 if (!PyUnicode_IS_ASCII(unicode))
11562 return 0;
11563 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11564 return strlen(str) == len &&
11565 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11566}
11567
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011568int
11569_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11570{
11571 PyObject *right_uni;
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011572
11573 assert(_PyUnicode_CHECK(left));
11574 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011575#ifndef NDEBUG
11576 for (const char *p = right->string; *p; p++) {
11577 assert((unsigned char)*p < 128);
11578 }
11579#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011580
11581 if (PyUnicode_READY(left) == -1) {
11582 /* memory error or bad data */
11583 PyErr_Clear();
11584 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11585 }
11586
11587 if (!PyUnicode_IS_ASCII(left))
11588 return 0;
11589
11590 right_uni = _PyUnicode_FromId(right); /* borrowed */
11591 if (right_uni == NULL) {
11592 /* memory error or bad data */
11593 PyErr_Clear();
11594 return _PyUnicode_EqualToASCIIString(left, right->string);
11595 }
11596
11597 if (left == right_uni)
11598 return 1;
11599
11600 if (PyUnicode_CHECK_INTERNED(left))
11601 return 0;
11602
INADA Naoki7cc95f52018-01-28 02:07:09 +090011603 assert(_PyUnicode_HASH(right_uni) != -1);
Victor Stinner607b1022020-05-05 18:50:30 +020011604 Py_hash_t hash = _PyUnicode_HASH(left);
Victor Stinnerea251802020-12-26 02:58:33 +010011605 if (hash != -1 && hash != _PyUnicode_HASH(right_uni)) {
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011606 return 0;
Victor Stinnerea251802020-12-26 02:58:33 +010011607 }
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011608
11609 return unicode_compare_eq(left, right_uni);
11610}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011611
Alexander Belopolsky40018472011-02-26 01:02:56 +000011612PyObject *
11613PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011614{
11615 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011616
Victor Stinnere5567ad2012-10-23 02:48:49 +020011617 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11618 Py_RETURN_NOTIMPLEMENTED;
11619
11620 if (PyUnicode_READY(left) == -1 ||
11621 PyUnicode_READY(right) == -1)
11622 return NULL;
11623
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011624 if (left == right) {
11625 switch (op) {
11626 case Py_EQ:
11627 case Py_LE:
11628 case Py_GE:
11629 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011630 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011631 case Py_NE:
11632 case Py_LT:
11633 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011634 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011635 default:
11636 PyErr_BadArgument();
11637 return NULL;
11638 }
11639 }
11640 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011641 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011642 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011643 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011644 }
11645 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011646 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011647 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011648 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011649}
11650
Alexander Belopolsky40018472011-02-26 01:02:56 +000011651int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011652_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11653{
11654 return unicode_eq(aa, bb);
11655}
11656
11657int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011658PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011659{
Victor Stinner77282cb2013-04-14 19:22:47 +020011660 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011661 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011662 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011663 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011664
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011665 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011666 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011667 "'in <string>' requires string as left operand, not %.100s",
11668 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011669 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011670 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011671 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011672 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011673 if (ensure_unicode(str) < 0)
11674 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011675
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011676 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011677 kind2 = PyUnicode_KIND(substr);
11678 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011679 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011680 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011681 len2 = PyUnicode_GET_LENGTH(substr);
11682 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011683 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011684 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011685 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011686 if (len2 == 1) {
11687 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11688 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011689 return result;
11690 }
11691 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011692 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011693 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011694 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011695 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011696
Victor Stinner77282cb2013-04-14 19:22:47 +020011697 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698 case PyUnicode_1BYTE_KIND:
11699 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11700 break;
11701 case PyUnicode_2BYTE_KIND:
11702 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11703 break;
11704 case PyUnicode_4BYTE_KIND:
11705 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11706 break;
11707 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011708 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011710
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011711 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substr)));
Victor Stinner77282cb2013-04-14 19:22:47 +020011712 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011713 PyMem_Free((void *)buf2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714
Guido van Rossum403d68b2000-03-13 15:55:09 +000011715 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011716}
11717
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718/* Concat to string or Unicode object giving a new Unicode object. */
11719
Alexander Belopolsky40018472011-02-26 01:02:56 +000011720PyObject *
11721PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011722{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011723 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011724 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011725 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011726
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011727 if (ensure_unicode(left) < 0)
11728 return NULL;
11729
11730 if (!PyUnicode_Check(right)) {
11731 PyErr_Format(PyExc_TypeError,
11732 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011733 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011734 return NULL;
11735 }
11736 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738
11739 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011740 PyObject *empty = unicode_get_empty(); // Borrowed reference
11741 if (left == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011742 return PyUnicode_FromObject(right);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011743 }
11744 if (right == empty) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011745 return PyUnicode_FromObject(left);
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011748 left_len = PyUnicode_GET_LENGTH(left);
11749 right_len = PyUnicode_GET_LENGTH(right);
11750 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011751 PyErr_SetString(PyExc_OverflowError,
11752 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011753 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011754 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011755 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011756
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011757 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11758 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011759 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011762 result = PyUnicode_New(new_len, maxchar);
11763 if (result == NULL)
11764 return NULL;
11765 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11766 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11767 assert(_PyUnicode_CheckConsistency(result, 1));
11768 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769}
11770
Walter Dörwald1ab83302007-05-18 17:15:44 +000011771void
Victor Stinner23e56682011-10-03 03:54:37 +020011772PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011773{
Victor Stinner23e56682011-10-03 03:54:37 +020011774 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011775 Py_UCS4 maxchar, maxchar2;
11776 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011777
11778 if (p_left == NULL) {
11779 if (!PyErr_Occurred())
11780 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011781 return;
11782 }
Victor Stinner23e56682011-10-03 03:54:37 +020011783 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011784 if (right == NULL || left == NULL
11785 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011786 if (!PyErr_Occurred())
11787 PyErr_BadInternalCall();
11788 goto error;
11789 }
11790
Benjamin Petersonbac79492012-01-14 13:34:47 -050011791 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011792 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011793 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011794 goto error;
11795
Victor Stinner488fa492011-12-12 00:01:39 +010011796 /* Shortcuts */
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011797 PyObject *empty = unicode_get_empty(); // Borrowed reference
11798 if (left == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011799 Py_DECREF(left);
11800 Py_INCREF(right);
11801 *p_left = right;
11802 return;
11803 }
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011804 if (right == empty) {
Victor Stinner488fa492011-12-12 00:01:39 +010011805 return;
Victor Stinnerf363d0a2020-06-24 00:10:40 +020011806 }
Victor Stinner488fa492011-12-12 00:01:39 +010011807
11808 left_len = PyUnicode_GET_LENGTH(left);
11809 right_len = PyUnicode_GET_LENGTH(right);
11810 if (left_len > PY_SSIZE_T_MAX - right_len) {
11811 PyErr_SetString(PyExc_OverflowError,
11812 "strings are too large to concat");
11813 goto error;
11814 }
11815 new_len = left_len + right_len;
11816
11817 if (unicode_modifiable(left)
11818 && PyUnicode_CheckExact(right)
11819 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011820 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11821 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011822 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011823 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011824 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11825 {
11826 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011827 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011828 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011829
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011830 /* copy 'right' into the newly allocated area of 'left' */
11831 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011832 }
Victor Stinner488fa492011-12-12 00:01:39 +010011833 else {
11834 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11835 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011836 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011837
Victor Stinner488fa492011-12-12 00:01:39 +010011838 /* Concat the two Unicode strings */
11839 res = PyUnicode_New(new_len, maxchar);
11840 if (res == NULL)
11841 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011842 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11843 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011844 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011845 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011846 }
11847 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011848 return;
11849
11850error:
Victor Stinner488fa492011-12-12 00:01:39 +010011851 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011852}
11853
11854void
11855PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11856{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011857 PyUnicode_Append(pleft, right);
11858 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011859}
11860
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011861/*
11862Wraps stringlib_parse_args_finds() and additionally ensures that the
11863first argument is a unicode object.
11864*/
11865
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011866static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011867parse_args_finds_unicode(const char * function_name, PyObject *args,
11868 PyObject **substring,
11869 Py_ssize_t *start, Py_ssize_t *end)
11870{
11871 if(stringlib_parse_args_finds(function_name, args, substring,
11872 start, end)) {
11873 if (ensure_unicode(*substring) < 0)
11874 return 0;
11875 return 1;
11876 }
11877 return 0;
11878}
11879
/* Docstring text for str.count(), defined under the name count__doc__. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011880PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011881 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011882\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011883Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011884string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011885interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011886
11887static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011888unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011890 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011891 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011892 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011894 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011895 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011898 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011899 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011901 kind1 = PyUnicode_KIND(self);
11902 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011903 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011904 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011905
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 len1 = PyUnicode_GET_LENGTH(self);
11907 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011909 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011910 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011911
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011912 buf1 = PyUnicode_DATA(self);
11913 buf2 = PyUnicode_DATA(substring);
11914 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011915 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011916 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011917 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011918 }
11919 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011920 case PyUnicode_1BYTE_KIND:
11921 iresult = ucs1lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011922 ((const Py_UCS1*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 buf2, len2, PY_SSIZE_T_MAX
11924 );
11925 break;
11926 case PyUnicode_2BYTE_KIND:
11927 iresult = ucs2lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011928 ((const Py_UCS2*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011929 buf2, len2, PY_SSIZE_T_MAX
11930 );
11931 break;
11932 case PyUnicode_4BYTE_KIND:
11933 iresult = ucs4lib_count(
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011934 ((const Py_UCS4*)buf1) + start, end - start,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 buf2, len2, PY_SSIZE_T_MAX
11936 );
11937 break;
11938 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011939 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 }
11941
11942 result = PyLong_FromSsize_t(iresult);
11943
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011944 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011945 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011946 PyMem_Free((void *)buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011947
Guido van Rossumd57fd912000-03-10 22:53:23 +000011948 return result;
11949}
11950
INADA Naoki3ae20562017-01-16 20:41:20 +090011951/*[clinic input]
11952str.encode as unicode_encode
11953
11954 encoding: str(c_default="NULL") = 'utf-8'
11955 The encoding in which to encode the string.
11956 errors: str(c_default="NULL") = 'strict'
11957 The error handling scheme to use for encoding errors.
11958 The default is 'strict' meaning that encoding errors raise a
11959 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11960 'xmlcharrefreplace' as well as any other name registered with
11961 codecs.register_error that can handle UnicodeEncodeErrors.
11962
11963Encode the string using the codec registered for encoding.
11964[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965
11966static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011967unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011968/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011970 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011971}
11972
INADA Naoki3ae20562017-01-16 20:41:20 +090011973/*[clinic input]
11974str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011975
INADA Naoki3ae20562017-01-16 20:41:20 +090011976 tabsize: int = 8
11977
11978Return a copy where all tab characters are expanded using spaces.
11979
11980If tabsize is not given, a tab size of 8 characters is assumed.
11981[clinic start generated code]*/
11982
11983static PyObject *
11984unicode_expandtabs_impl(PyObject *self, int tabsize)
11985/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011986{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011987 Py_ssize_t i, j, line_pos, src_len, incr;
11988 Py_UCS4 ch;
11989 PyObject *u;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030011990 const void *src_data;
11991 void *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011992 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011993 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994
Antoine Pitrou22425222011-10-04 19:10:51 +020011995 if (PyUnicode_READY(self) == -1)
11996 return NULL;
11997
Thomas Wouters7e474022000-07-16 12:04:32 +000011998 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011999 src_len = PyUnicode_GET_LENGTH(self);
12000 i = j = line_pos = 0;
12001 kind = PyUnicode_KIND(self);
12002 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020012003 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020012004 for (; i < src_len; i++) {
12005 ch = PyUnicode_READ(kind, src_data, i);
12006 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020012007 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000012008 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020012009 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000012010 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020012011 goto overflow;
12012 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000012013 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000012014 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012015 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012016 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000012017 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020012018 goto overflow;
12019 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020012021 if (ch == '\n' || ch == '\r')
12022 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020012024 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010012025 if (!found)
12026 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000012027
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020012029 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030 if (!u)
12031 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020012032 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033
Antoine Pitroue71d5742011-10-04 15:55:09 +020012034 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035
Antoine Pitroue71d5742011-10-04 15:55:09 +020012036 for (; i < src_len; i++) {
12037 ch = PyUnicode_READ(kind, src_data, i);
12038 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000012039 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020012040 incr = tabsize - (line_pos % tabsize);
12041 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010012042 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010012043 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000012044 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012045 }
Benjamin Peterson29060642009-01-31 22:14:21 +000012046 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020012047 line_pos++;
12048 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000012049 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020012050 if (ch == '\n' || ch == '\r')
12051 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012052 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020012053 }
12054 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010012055 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000012056
Antoine Pitroue71d5742011-10-04 15:55:09 +020012057 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000012058 PyErr_SetString(PyExc_OverflowError, "new string is too long");
12059 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012060}
12061
/* Docstring text for str.find(), defined under the name find__doc__. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012062PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012063 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012064\n\
12065Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012066such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067arguments start and end are interpreted as in slice notation.\n\
12068\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012069Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070
12071static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012074 /* initialize variables to prevent gcc warning */
12075 PyObject *substring = NULL;
12076 Py_ssize_t start = 0;
12077 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012078 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012080 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012082
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012083 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012084 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012086 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012087
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012088 if (result == -2)
12089 return NULL;
12090
Christian Heimes217cfd12007-12-02 14:31:20 +000012091 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012092}
12093
12094static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020012095unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012096{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012097 const void *data;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012098 enum PyUnicode_Kind kind;
12099 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012100
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012101 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012102 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012104 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030012105 if (PyUnicode_READY(self) == -1) {
12106 return NULL;
12107 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020012108 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
12109 PyErr_SetString(PyExc_IndexError, "string index out of range");
12110 return NULL;
12111 }
12112 kind = PyUnicode_KIND(self);
12113 data = PyUnicode_DATA(self);
12114 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010012115 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116}
12117
Guido van Rossumc2504932007-09-18 19:42:40 +000012118/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010012119 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000012120static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012121unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080012123 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000012124
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012125#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050012126 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040012127#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012128 if (_PyUnicode_HASH(self) != -1)
12129 return _PyUnicode_HASH(self);
12130 if (PyUnicode_READY(self) == -1)
12131 return -1;
animalizea1d14252019-01-02 20:16:06 +080012132
Christian Heimes985ecdc2013-11-20 11:46:18 +010012133 x = _Py_HashBytes(PyUnicode_DATA(self),
12134 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000012136 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012137}
12138
/* Docstring text for str.index(), defined under the name index__doc__. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012139PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012140 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012141\n\
oldkaa0735f2018-02-02 16:52:55 +080012142Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012143such that sub is contained within S[start:end]. Optional\n\
12144arguments start and end are interpreted as in slice notation.\n\
12145\n\
12146Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012147
12148static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012151 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000012152 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012153 PyObject *substring = NULL;
12154 Py_ssize_t start = 0;
12155 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012157 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000012158 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012160 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012163 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 if (result == -2)
12166 return NULL;
12167
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168 if (result < 0) {
12169 PyErr_SetString(PyExc_ValueError, "substring not found");
12170 return NULL;
12171 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012172
Christian Heimes217cfd12007-12-02 14:31:20 +000012173 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174}
12175
INADA Naoki3ae20562017-01-16 20:41:20 +090012176/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090012177str.isascii as unicode_isascii
12178
12179Return True if all characters in the string are ASCII, False otherwise.
12180
12181ASCII characters have code points in the range U+0000-U+007F.
12182Empty string is ASCII too.
12183[clinic start generated code]*/
12184
12185static PyObject *
12186unicode_isascii_impl(PyObject *self)
12187/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
12188{
12189 if (PyUnicode_READY(self) == -1) {
12190 return NULL;
12191 }
12192 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
12193}
12194
12195/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090012196str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012197
INADA Naoki3ae20562017-01-16 20:41:20 +090012198Return True if the string is a lowercase string, False otherwise.
12199
12200A string is lowercase if all cased characters in the string are lowercase and
12201there is at least one cased character in the string.
12202[clinic start generated code]*/
12203
12204static PyObject *
12205unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012206/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012208 Py_ssize_t i, length;
12209 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012210 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012211 int cased;
12212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 if (PyUnicode_READY(self) == -1)
12214 return NULL;
12215 length = PyUnicode_GET_LENGTH(self);
12216 kind = PyUnicode_KIND(self);
12217 data = PyUnicode_DATA(self);
12218
Guido van Rossumd57fd912000-03-10 22:53:23 +000012219 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012220 if (length == 1)
12221 return PyBool_FromLong(
12222 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012224 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012225 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012226 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012227
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012229 for (i = 0; i < length; i++) {
12230 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012231
Benjamin Peterson29060642009-01-31 22:14:21 +000012232 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012233 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012234 else if (!cased && Py_UNICODE_ISLOWER(ch))
12235 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012237 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238}
12239
INADA Naoki3ae20562017-01-16 20:41:20 +090012240/*[clinic input]
12241str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242
INADA Naoki3ae20562017-01-16 20:41:20 +090012243Return True if the string is an uppercase string, False otherwise.
12244
12245A string is uppercase if all cased characters in the string are uppercase and
12246there is at least one cased character in the string.
12247[clinic start generated code]*/
12248
12249static PyObject *
12250unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012251/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012252{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012253 Py_ssize_t i, length;
12254 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012255 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012256 int cased;
12257
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012258 if (PyUnicode_READY(self) == -1)
12259 return NULL;
12260 length = PyUnicode_GET_LENGTH(self);
12261 kind = PyUnicode_KIND(self);
12262 data = PyUnicode_DATA(self);
12263
Guido van Rossumd57fd912000-03-10 22:53:23 +000012264 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012265 if (length == 1)
12266 return PyBool_FromLong(
12267 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012269 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012270 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012271 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012272
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012274 for (i = 0; i < length; i++) {
12275 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012276
Benjamin Peterson29060642009-01-31 22:14:21 +000012277 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012278 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012279 else if (!cased && Py_UNICODE_ISUPPER(ch))
12280 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012281 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012282 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012283}
12284
INADA Naoki3ae20562017-01-16 20:41:20 +090012285/*[clinic input]
12286str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000012287
INADA Naoki3ae20562017-01-16 20:41:20 +090012288Return True if the string is a title-cased string, False otherwise.
12289
12290In a title-cased string, upper- and title-case characters may only
12291follow uncased characters and lowercase characters only cased ones.
12292[clinic start generated code]*/
12293
12294static PyObject *
12295unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012296/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012297{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012298 Py_ssize_t i, length;
12299 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012300 const void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012301 int cased, previous_is_cased;
12302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 if (PyUnicode_READY(self) == -1)
12304 return NULL;
12305 length = PyUnicode_GET_LENGTH(self);
12306 kind = PyUnicode_KIND(self);
12307 data = PyUnicode_DATA(self);
12308
Guido van Rossumd57fd912000-03-10 22:53:23 +000012309 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 if (length == 1) {
12311 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12312 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
12313 (Py_UNICODE_ISUPPER(ch) != 0));
12314 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012315
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012316 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012317 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012318 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012319
Guido van Rossumd57fd912000-03-10 22:53:23 +000012320 cased = 0;
12321 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012322 for (i = 0; i < length; i++) {
12323 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000012324
Benjamin Peterson29060642009-01-31 22:14:21 +000012325 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
12326 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012327 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012328 previous_is_cased = 1;
12329 cased = 1;
12330 }
12331 else if (Py_UNICODE_ISLOWER(ch)) {
12332 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012333 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012334 previous_is_cased = 1;
12335 cased = 1;
12336 }
12337 else
12338 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012339 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012340 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012341}
12342
INADA Naoki3ae20562017-01-16 20:41:20 +090012343/*[clinic input]
12344str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012345
INADA Naoki3ae20562017-01-16 20:41:20 +090012346Return True if the string is a whitespace string, False otherwise.
12347
12348A string is whitespace if all characters in the string are whitespace and there
12349is at least one character in the string.
12350[clinic start generated code]*/
12351
12352static PyObject *
12353unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012354/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012356 Py_ssize_t i, length;
12357 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012358 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359
12360 if (PyUnicode_READY(self) == -1)
12361 return NULL;
12362 length = PyUnicode_GET_LENGTH(self);
12363 kind = PyUnicode_KIND(self);
12364 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365
Guido van Rossumd57fd912000-03-10 22:53:23 +000012366 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367 if (length == 1)
12368 return PyBool_FromLong(
12369 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012371 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012372 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012373 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012375 for (i = 0; i < length; i++) {
12376 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012377 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012378 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012379 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012380 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381}
12382
INADA Naoki3ae20562017-01-16 20:41:20 +090012383/*[clinic input]
12384str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012385
INADA Naoki3ae20562017-01-16 20:41:20 +090012386Return True if the string is an alphabetic string, False otherwise.
12387
12388A string is alphabetic if all characters in the string are alphabetic and there
12389is at least one character in the string.
12390[clinic start generated code]*/
12391
12392static PyObject *
12393unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012394/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012395{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012396 Py_ssize_t i, length;
12397 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012398 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012399
12400 if (PyUnicode_READY(self) == -1)
12401 return NULL;
12402 length = PyUnicode_GET_LENGTH(self);
12403 kind = PyUnicode_KIND(self);
12404 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012405
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012406 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012407 if (length == 1)
12408 return PyBool_FromLong(
12409 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012410
12411 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012412 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012413 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012414
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012415 for (i = 0; i < length; i++) {
12416 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012417 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012418 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012419 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012420}
12421
INADA Naoki3ae20562017-01-16 20:41:20 +090012422/*[clinic input]
12423str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012424
INADA Naoki3ae20562017-01-16 20:41:20 +090012425Return True if the string is an alpha-numeric string, False otherwise.
12426
12427A string is alpha-numeric if all characters in the string are alpha-numeric and
12428there is at least one character in the string.
12429[clinic start generated code]*/
12430
12431static PyObject *
12432unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012433/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012434{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012435 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012436 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 Py_ssize_t len, i;
12438
12439 if (PyUnicode_READY(self) == -1)
12440 return NULL;
12441
12442 kind = PyUnicode_KIND(self);
12443 data = PyUnicode_DATA(self);
12444 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012445
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012446 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012447 if (len == 1) {
12448 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12449 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12450 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012451
12452 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012453 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012454 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012455
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012456 for (i = 0; i < len; i++) {
12457 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012458 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012459 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012460 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012461 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012462}
12463
INADA Naoki3ae20562017-01-16 20:41:20 +090012464/*[clinic input]
12465str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466
INADA Naoki3ae20562017-01-16 20:41:20 +090012467Return True if the string is a decimal string, False otherwise.
12468
12469A string is a decimal string if all characters in the string are decimal and
12470there is at least one character in the string.
12471[clinic start generated code]*/
12472
12473static PyObject *
12474unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012475/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012476{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012477 Py_ssize_t i, length;
12478 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012479 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012480
12481 if (PyUnicode_READY(self) == -1)
12482 return NULL;
12483 length = PyUnicode_GET_LENGTH(self);
12484 kind = PyUnicode_KIND(self);
12485 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012486
Guido van Rossumd57fd912000-03-10 22:53:23 +000012487 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012488 if (length == 1)
12489 return PyBool_FromLong(
12490 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012491
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012492 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012493 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012494 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012496 for (i = 0; i < length; i++) {
12497 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012498 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012499 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012500 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012501}
12502
INADA Naoki3ae20562017-01-16 20:41:20 +090012503/*[clinic input]
12504str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012505
INADA Naoki3ae20562017-01-16 20:41:20 +090012506Return True if the string is a digit string, False otherwise.
12507
12508A string is a digit string if all characters in the string are digits and there
12509is at least one character in the string.
12510[clinic start generated code]*/
12511
12512static PyObject *
12513unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012514/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012515{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012516 Py_ssize_t i, length;
12517 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012518 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519
12520 if (PyUnicode_READY(self) == -1)
12521 return NULL;
12522 length = PyUnicode_GET_LENGTH(self);
12523 kind = PyUnicode_KIND(self);
12524 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012527 if (length == 1) {
12528 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12529 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012532 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012533 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012534 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012536 for (i = 0; i < length; i++) {
12537 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012538 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012539 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012540 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012541}
12542
INADA Naoki3ae20562017-01-16 20:41:20 +090012543/*[clinic input]
12544str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012545
INADA Naoki3ae20562017-01-16 20:41:20 +090012546Return True if the string is a numeric string, False otherwise.
12547
12548A string is numeric if all characters in the string are numeric and there is at
12549least one character in the string.
12550[clinic start generated code]*/
12551
12552static PyObject *
12553unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012554/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012555{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012556 Py_ssize_t i, length;
12557 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012558 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012559
12560 if (PyUnicode_READY(self) == -1)
12561 return NULL;
12562 length = PyUnicode_GET_LENGTH(self);
12563 kind = PyUnicode_KIND(self);
12564 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012565
Guido van Rossumd57fd912000-03-10 22:53:23 +000012566 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 if (length == 1)
12568 return PyBool_FromLong(
12569 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012570
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012571 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012573 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012574
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 for (i = 0; i < length; i++) {
12576 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012577 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012578 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012579 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580}
12581
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012582Py_ssize_t
12583_PyUnicode_ScanIdentifier(PyObject *self)
Martin v. Löwis47383402007-08-15 07:32:56 +000012584{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012585 Py_ssize_t i;
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012586 if (PyUnicode_READY(self) == -1)
12587 return -1;
Martin v. Löwis47383402007-08-15 07:32:56 +000012588
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012589 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012590 if (len == 0) {
12591 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012592 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012593 }
12594
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012595 int kind = PyUnicode_KIND(self);
12596 const void *data = PyUnicode_DATA(self);
12597 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Martin v. Löwis47383402007-08-15 07:32:56 +000012598 /* PEP 3131 says that the first character must be in
12599 XID_Start and subsequent characters in XID_Continue,
12600 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012601 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012602 letters, digits, underscore). However, given the current
12603 definition of XID_Start and XID_Continue, it is sufficient
12604 to check just for these, except that _ must be allowed
12605 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012606 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012607 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012608 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012609
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012610 for (i = 1; i < len; i++) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012611 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012612 if (!_PyUnicode_IsXidContinue(ch)) {
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012613 return i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012614 }
12615 }
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012616 return i;
12617}
12618
12619int
12620PyUnicode_IsIdentifier(PyObject *self)
12621{
12622 if (PyUnicode_IS_READY(self)) {
12623 Py_ssize_t i = _PyUnicode_ScanIdentifier(self);
12624 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
12625 /* an empty string is not a valid identifier */
12626 return len && i == len;
12627 }
12628 else {
Inada Naoki2c4928d2020-06-17 20:09:44 +090012629_Py_COMP_DIAG_PUSH
12630_Py_COMP_DIAG_IGNORE_DEPR_DECLS
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012631 Py_ssize_t i = 0, len = PyUnicode_GET_SIZE(self);
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012632 if (len == 0) {
12633 /* an empty string is not a valid identifier */
12634 return 0;
12635 }
12636
12637 const wchar_t *wstr = _PyUnicode_WSTR(self);
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012638 Py_UCS4 ch = wstr[i++];
12639#if SIZEOF_WCHAR_T == 2
12640 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12641 && i < len
12642 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12643 {
12644 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12645 i++;
12646 }
12647#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012648 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
12649 return 0;
12650 }
12651
Serhiy Storchaka5650e762020-05-12 16:18:00 +030012652 while (i < len) {
12653 ch = wstr[i++];
12654#if SIZEOF_WCHAR_T == 2
12655 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
12656 && i < len
12657 && Py_UNICODE_IS_LOW_SURROGATE(wstr[i]))
12658 {
12659 ch = Py_UNICODE_JOIN_SURROGATES(ch, wstr[i]);
12660 i++;
12661 }
12662#endif
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012663 if (!_PyUnicode_IsXidContinue(ch)) {
12664 return 0;
12665 }
12666 }
12667 return 1;
Inada Naoki2c4928d2020-06-17 20:09:44 +090012668_Py_COMP_DIAG_POP
Serhiy Storchaka74ea6b52020-05-12 12:42:04 +030012669 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012670}
12671
INADA Naoki3ae20562017-01-16 20:41:20 +090012672/*[clinic input]
12673str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012674
INADA Naoki3ae20562017-01-16 20:41:20 +090012675Return True if the string is a valid Python identifier, False otherwise.
12676
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012677Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012678such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012679[clinic start generated code]*/
12680
12681static PyObject *
12682unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012683/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012684{
12685 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12686}
12687
INADA Naoki3ae20562017-01-16 20:41:20 +090012688/*[clinic input]
12689str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012690
INADA Naoki3ae20562017-01-16 20:41:20 +090012691Return True if the string is printable, False otherwise.
12692
12693A string is printable if all of its characters are considered printable in
12694repr() or if it is empty.
12695[clinic start generated code]*/
12696
12697static PyObject *
12698unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012699/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012700{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701 Py_ssize_t i, length;
12702 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012703 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012704
12705 if (PyUnicode_READY(self) == -1)
12706 return NULL;
12707 length = PyUnicode_GET_LENGTH(self);
12708 kind = PyUnicode_KIND(self);
12709 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012710
12711 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 if (length == 1)
12713 return PyBool_FromLong(
12714 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 for (i = 0; i < length; i++) {
12717 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012718 Py_RETURN_FALSE;
12719 }
12720 }
12721 Py_RETURN_TRUE;
12722}
12723
INADA Naoki3ae20562017-01-16 20:41:20 +090012724/*[clinic input]
12725str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012726
INADA Naoki3ae20562017-01-16 20:41:20 +090012727 iterable: object
12728 /
12729
12730Concatenate any number of strings.
12731
Martin Panter91a88662017-01-24 00:30:06 +000012732The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012733The result is returned as a new string.
12734
12735Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12736[clinic start generated code]*/
12737
12738static PyObject *
12739unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012740/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012741{
INADA Naoki3ae20562017-01-16 20:41:20 +090012742 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012743}
12744
Martin v. Löwis18e16552006-02-15 17:27:45 +000012745static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012746unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012748 if (PyUnicode_READY(self) == -1)
12749 return -1;
12750 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012751}
12752
INADA Naoki3ae20562017-01-16 20:41:20 +090012753/*[clinic input]
12754str.ljust as unicode_ljust
12755
12756 width: Py_ssize_t
12757 fillchar: Py_UCS4 = ' '
12758 /
12759
12760Return a left-justified string of length width.
12761
12762Padding is done using the specified fill character (default is a space).
12763[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012764
12765static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012766unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12767/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012768{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012769 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012770 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012771
Victor Stinnerc4b49542011-12-11 22:44:26 +010012772 if (PyUnicode_GET_LENGTH(self) >= width)
12773 return unicode_result_unchanged(self);
12774
12775 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012776}
12777
INADA Naoki3ae20562017-01-16 20:41:20 +090012778/*[clinic input]
12779str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012780
INADA Naoki3ae20562017-01-16 20:41:20 +090012781Return a copy of the string converted to lowercase.
12782[clinic start generated code]*/
12783
12784static PyObject *
12785unicode_lower_impl(PyObject *self)
12786/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012788 if (PyUnicode_READY(self) == -1)
12789 return NULL;
12790 if (PyUnicode_IS_ASCII(self))
12791 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012792 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012793}
12794
/* Strip modes, compared against the striptype argument of
   _PyUnicode_XStrip().  They double as indexes into stripfuncnames[]
   below, so the numbering and the table order must stay in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names, indexed by the strip modes above.  Both the strings and
   the pointer table itself are const: this is a fixed lookup table that
   is never meant to be modified at run time. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the strip method for mode i (one of the *STRIP constants). */
#define STRIPNAME(i) (stripfuncnames[(i)])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012803
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012804/* externally visible for str.strip(unicode) */
12805PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012806_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012807{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012808 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012809 int kind;
12810 Py_ssize_t i, j, len;
12811 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012812 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12815 return NULL;
12816
12817 kind = PyUnicode_KIND(self);
12818 data = PyUnicode_DATA(self);
12819 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012820 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012821 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12822 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012823 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012824
Benjamin Peterson14339b62009-01-31 16:36:08 +000012825 i = 0;
12826 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012827 while (i < len) {
12828 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12829 if (!BLOOM(sepmask, ch))
12830 break;
12831 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12832 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012833 i++;
12834 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012835 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012836
Benjamin Peterson14339b62009-01-31 16:36:08 +000012837 j = len;
12838 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012839 j--;
12840 while (j >= i) {
12841 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12842 if (!BLOOM(sepmask, ch))
12843 break;
12844 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12845 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012846 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012847 }
12848
Benjamin Peterson29060642009-01-31 22:14:21 +000012849 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012850 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012851
Victor Stinner7931d9a2011-11-04 00:22:48 +010012852 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012853}
12854
12855PyObject*
12856PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12857{
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012858 const unsigned char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012859 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012860 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012861
Victor Stinnerde636f32011-10-01 03:55:54 +020012862 if (PyUnicode_READY(self) == -1)
12863 return NULL;
12864
Victor Stinner684d5fd2012-05-03 02:32:34 +020012865 length = PyUnicode_GET_LENGTH(self);
12866 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012867
Victor Stinner684d5fd2012-05-03 02:32:34 +020012868 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012869 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012870
Victor Stinnerde636f32011-10-01 03:55:54 +020012871 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012872 PyErr_SetString(PyExc_IndexError, "string index out of range");
12873 return NULL;
12874 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012875 if (start >= length || end < start)
12876 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012877
Victor Stinner684d5fd2012-05-03 02:32:34 +020012878 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012879 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012880 data = PyUnicode_1BYTE_DATA(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012881 return _PyUnicode_FromASCII((const char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012882 }
12883 else {
12884 kind = PyUnicode_KIND(self);
12885 data = PyUnicode_1BYTE_DATA(self);
12886 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012887 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012888 length);
12889 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012890}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891
12892static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012893do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012894{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012895 Py_ssize_t len, i, j;
12896
12897 if (PyUnicode_READY(self) == -1)
12898 return NULL;
12899
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012900 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012901
Victor Stinnercc7af722013-04-09 22:39:24 +020012902 if (PyUnicode_IS_ASCII(self)) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012903 const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
Victor Stinnercc7af722013-04-09 22:39:24 +020012904
12905 i = 0;
12906 if (striptype != RIGHTSTRIP) {
12907 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012908 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012909 if (!_Py_ascii_whitespace[ch])
12910 break;
12911 i++;
12912 }
12913 }
12914
12915 j = len;
12916 if (striptype != LEFTSTRIP) {
12917 j--;
12918 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012919 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012920 if (!_Py_ascii_whitespace[ch])
12921 break;
12922 j--;
12923 }
12924 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012925 }
12926 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012927 else {
12928 int kind = PyUnicode_KIND(self);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030012929 const void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012930
Victor Stinnercc7af722013-04-09 22:39:24 +020012931 i = 0;
12932 if (striptype != RIGHTSTRIP) {
12933 while (i < len) {
12934 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12935 if (!Py_UNICODE_ISSPACE(ch))
12936 break;
12937 i++;
12938 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012939 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012940
12941 j = len;
12942 if (striptype != LEFTSTRIP) {
12943 j--;
12944 while (j >= i) {
12945 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12946 if (!Py_UNICODE_ISSPACE(ch))
12947 break;
12948 j--;
12949 }
12950 j++;
12951 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012952 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012953
Victor Stinner7931d9a2011-11-04 00:22:48 +010012954 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955}
12956
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012957
12958static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012959do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012960{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012961 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012962 if (PyUnicode_Check(sep))
12963 return _PyUnicode_XStrip(self, striptype, sep);
12964 else {
12965 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012966 "%s arg must be None or str",
12967 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012968 return NULL;
12969 }
12970 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012971
Benjamin Peterson14339b62009-01-31 16:36:08 +000012972 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012973}
12974
12975
INADA Naoki3ae20562017-01-16 20:41:20 +090012976/*[clinic input]
12977str.strip as unicode_strip
12978
12979 chars: object = None
12980 /
12981
Zachary Ware09895c22019-10-09 16:09:00 -050012982Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012983
12984If chars is given and not None, remove characters in chars instead.
12985[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012986
12987static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012988unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012989/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012990{
INADA Naoki3ae20562017-01-16 20:41:20 +090012991 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012992}
12993
12994
INADA Naoki3ae20562017-01-16 20:41:20 +090012995/*[clinic input]
12996str.lstrip as unicode_lstrip
12997
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012998 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012999 /
13000
13001Return a copy of the string with leading whitespace removed.
13002
13003If chars is given and not None, remove characters in chars instead.
13004[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013005
13006static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013007unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030013008/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013009{
INADA Naoki3ae20562017-01-16 20:41:20 +090013010 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013011}
13012
13013
INADA Naoki3ae20562017-01-16 20:41:20 +090013014/*[clinic input]
13015str.rstrip as unicode_rstrip
13016
Serhiy Storchaka279f4462019-09-14 12:24:05 +030013017 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090013018 /
13019
13020Return a copy of the string with trailing whitespace removed.
13021
13022If chars is given and not None, remove characters in chars instead.
13023[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013024
13025static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013026unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030013027/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013028{
INADA Naoki3ae20562017-01-16 20:41:20 +090013029 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000013030}
13031
13032
Guido van Rossumd57fd912000-03-10 22:53:23 +000013033static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013034unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013035{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013036 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013037 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013038
Serhiy Storchaka05997252013-01-26 12:14:02 +020013039 if (len < 1)
13040 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000013041
Victor Stinnerc4b49542011-12-11 22:44:26 +010013042 /* no repeat, return original string */
13043 if (len == 1)
13044 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000013045
Benjamin Petersonbac79492012-01-14 13:34:47 -050013046 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013047 return NULL;
13048
Victor Stinnerc759f3e2011-10-01 03:09:58 +020013049 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020013050 PyErr_SetString(PyExc_OverflowError,
13051 "repeated string is too long");
13052 return NULL;
13053 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020013055
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013056 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013057 if (!u)
13058 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020013059 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000013060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013061 if (PyUnicode_GET_LENGTH(str) == 1) {
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013062 int kind = PyUnicode_KIND(str);
13063 Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010013064 if (kind == PyUnicode_1BYTE_KIND) {
13065 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020013066 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010013067 }
13068 else if (kind == PyUnicode_2BYTE_KIND) {
13069 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020013070 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010013071 ucs2[n] = fill_char;
13072 } else {
13073 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
13074 assert(kind == PyUnicode_4BYTE_KIND);
13075 for (n = 0; n < len; ++n)
13076 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020013077 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013078 }
13079 else {
13080 /* number of characters copied this far */
13081 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013082 Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013083 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020013084 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013085 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000013086 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013087 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020013088 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013089 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000013090 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000013091 }
13092
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013093 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013094 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013095}
13096
Alexander Belopolsky40018472011-02-26 01:02:56 +000013097PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013098PyUnicode_Replace(PyObject *str,
13099 PyObject *substr,
13100 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000013101 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013102{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013103 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
13104 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013105 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013106 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013107}
13108
INADA Naoki3ae20562017-01-16 20:41:20 +090013109/*[clinic input]
13110str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000013111
INADA Naoki3ae20562017-01-16 20:41:20 +090013112 old: unicode
13113 new: unicode
13114 count: Py_ssize_t = -1
13115 Maximum number of occurrences to replace.
13116 -1 (the default value) means replace all occurrences.
13117 /
13118
13119Return a copy with all occurrences of substring old replaced by new.
13120
13121If the optional argument count is given, only the first count occurrences are
13122replaced.
13123[clinic start generated code]*/
13124
13125static PyObject *
13126unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
13127 Py_ssize_t count)
13128/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013129{
Benjamin Peterson22a29702012-01-02 09:00:30 -060013130 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000013131 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090013132 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013133}
13134
sweeneydea81849b2020-04-22 17:05:48 -040013135/*[clinic input]
13136str.removeprefix as unicode_removeprefix
13137
13138 prefix: unicode
13139 /
13140
13141Return a str with the given prefix string removed if present.
13142
13143If the string starts with the prefix string, return string[len(prefix):].
13144Otherwise, return a copy of the original string.
13145[clinic start generated code]*/
13146
13147static PyObject *
13148unicode_removeprefix_impl(PyObject *self, PyObject *prefix)
13149/*[clinic end generated code: output=f1e5945e9763bcb9 input=27ec40b99a37eb88]*/
13150{
13151 int match = tailmatch(self, prefix, 0, PY_SSIZE_T_MAX, -1);
13152 if (match == -1) {
13153 return NULL;
13154 }
13155 if (match) {
13156 return PyUnicode_Substring(self, PyUnicode_GET_LENGTH(prefix),
13157 PyUnicode_GET_LENGTH(self));
13158 }
13159 return unicode_result_unchanged(self);
13160}
13161
13162/*[clinic input]
13163str.removesuffix as unicode_removesuffix
13164
13165 suffix: unicode
13166 /
13167
13168Return a str with the given suffix string removed if present.
13169
13170If the string ends with the suffix string and that suffix is not empty,
13171return string[:-len(suffix)]. Otherwise, return a copy of the original
13172string.
13173[clinic start generated code]*/
13174
13175static PyObject *
13176unicode_removesuffix_impl(PyObject *self, PyObject *suffix)
13177/*[clinic end generated code: output=d36629e227636822 input=12cc32561e769be4]*/
13178{
13179 int match = tailmatch(self, suffix, 0, PY_SSIZE_T_MAX, +1);
13180 if (match == -1) {
13181 return NULL;
13182 }
13183 if (match) {
13184 return PyUnicode_Substring(self, 0, PyUnicode_GET_LENGTH(self)
13185 - PyUnicode_GET_LENGTH(suffix));
13186 }
13187 return unicode_result_unchanged(self);
13188}
13189
Alexander Belopolsky40018472011-02-26 01:02:56 +000013190static PyObject *
13191unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013192{
Walter Dörwald79e913e2007-05-12 11:08:06 +000013193 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013194 Py_ssize_t isize;
13195 Py_ssize_t osize, squote, dquote, i, o;
13196 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020013197 int ikind, okind, unchanged;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013198 const void *idata;
13199 void *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000013200
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013201 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000013202 return NULL;
13203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013204 isize = PyUnicode_GET_LENGTH(unicode);
13205 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000013206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013207 /* Compute length of output, quote characters, and
13208 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020013209 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013210 max = 127;
13211 squote = dquote = 0;
13212 ikind = PyUnicode_KIND(unicode);
13213 for (i = 0; i < isize; i++) {
13214 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040013215 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013216 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040013217 case '\'': squote++; break;
13218 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013219 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040013220 incr = 2;
13221 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013222 default:
13223 /* Fast-path ASCII */
13224 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013225 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013226 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013227 ;
13228 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013229 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013230 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013231 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013232 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040013233 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040013235 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013236 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040013237 if (osize > PY_SSIZE_T_MAX - incr) {
13238 PyErr_SetString(PyExc_OverflowError,
13239 "string is too long to generate repr");
13240 return NULL;
13241 }
13242 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013243 }
13244
13245 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020013246 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013247 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020013248 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013249 if (dquote)
13250 /* Both squote and dquote present. Use squote,
13251 and escape them */
13252 osize += squote;
13253 else
13254 quote = '"';
13255 }
Victor Stinner55c08782013-04-14 18:45:39 +020013256 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013257
13258 repr = PyUnicode_New(osize, max);
13259 if (repr == NULL)
13260 return NULL;
13261 okind = PyUnicode_KIND(repr);
13262 odata = PyUnicode_DATA(repr);
13263
13264 PyUnicode_WRITE(okind, odata, 0, quote);
13265 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020013266 if (unchanged) {
13267 _PyUnicode_FastCopyCharacters(repr, 1,
13268 unicode, 0,
13269 isize);
13270 }
13271 else {
13272 for (i = 0, o = 1; i < isize; i++) {
13273 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013274
Victor Stinner55c08782013-04-14 18:45:39 +020013275 /* Escape quotes and backslashes */
13276 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000013277 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013278 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020013279 continue;
13280 }
13281
13282 /* Map special whitespace to '\t', \n', '\r' */
13283 if (ch == '\t') {
13284 PyUnicode_WRITE(okind, odata, o++, '\\');
13285 PyUnicode_WRITE(okind, odata, o++, 't');
13286 }
13287 else if (ch == '\n') {
13288 PyUnicode_WRITE(okind, odata, o++, '\\');
13289 PyUnicode_WRITE(okind, odata, o++, 'n');
13290 }
13291 else if (ch == '\r') {
13292 PyUnicode_WRITE(okind, odata, o++, '\\');
13293 PyUnicode_WRITE(okind, odata, o++, 'r');
13294 }
13295
13296 /* Map non-printable US ASCII to '\xhh' */
13297 else if (ch < ' ' || ch == 0x7F) {
13298 PyUnicode_WRITE(okind, odata, o++, '\\');
13299 PyUnicode_WRITE(okind, odata, o++, 'x');
13300 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13301 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13302 }
13303
13304 /* Copy ASCII characters as-is */
13305 else if (ch < 0x7F) {
13306 PyUnicode_WRITE(okind, odata, o++, ch);
13307 }
13308
13309 /* Non-ASCII characters */
13310 else {
13311 /* Map Unicode whitespace and control characters
13312 (categories Z* and C* except ASCII space)
13313 */
13314 if (!Py_UNICODE_ISPRINTABLE(ch)) {
13315 PyUnicode_WRITE(okind, odata, o++, '\\');
13316 /* Map 8-bit characters to '\xhh' */
13317 if (ch <= 0xff) {
13318 PyUnicode_WRITE(okind, odata, o++, 'x');
13319 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
13320 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
13321 }
13322 /* Map 16-bit characters to '\uxxxx' */
13323 else if (ch <= 0xffff) {
13324 PyUnicode_WRITE(okind, odata, o++, 'u');
13325 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13326 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13327 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13328 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13329 }
13330 /* Map 21-bit characters to '\U00xxxxxx' */
13331 else {
13332 PyUnicode_WRITE(okind, odata, o++, 'U');
13333 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
13334 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
13335 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
13336 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
13337 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
13338 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
13339 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
13340 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
13341 }
13342 }
13343 /* Copy characters as-is */
13344 else {
13345 PyUnicode_WRITE(okind, odata, o++, ch);
13346 }
Georg Brandl559e5d72008-06-11 18:37:52 +000013347 }
13348 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000013349 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013350 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020013351 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000013352 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353}
13354
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013355PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013356 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013357\n\
13358Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080013359such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013360arguments start and end are interpreted as in slice notation.\n\
13361\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013362Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013363
13364static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013365unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013366{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013367 /* initialize variables to prevent gcc warning */
13368 PyObject *substring = NULL;
13369 Py_ssize_t start = 0;
13370 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013371 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013373 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013374 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013375
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013376 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013377 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013378
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013379 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013380
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013381 if (result == -2)
13382 return NULL;
13383
Christian Heimes217cfd12007-12-02 14:31:20 +000013384 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013385}
13386
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013387PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013388 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013389\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070013390Return the highest index in S where substring sub is found,\n\
13391such that sub is contained within S[start:end]. Optional\n\
13392arguments start and end are interpreted as in slice notation.\n\
13393\n\
13394Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013395
13396static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013397unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013398{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010013399 /* initialize variables to prevent gcc warning */
13400 PyObject *substring = NULL;
13401 Py_ssize_t start = 0;
13402 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013403 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013404
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030013405 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013406 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013407
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013408 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013409 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013410
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013411 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013412
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013413 if (result == -2)
13414 return NULL;
13415
Guido van Rossumd57fd912000-03-10 22:53:23 +000013416 if (result < 0) {
13417 PyErr_SetString(PyExc_ValueError, "substring not found");
13418 return NULL;
13419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013420
Christian Heimes217cfd12007-12-02 14:31:20 +000013421 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013422}
13423
INADA Naoki3ae20562017-01-16 20:41:20 +090013424/*[clinic input]
13425str.rjust as unicode_rjust
13426
13427 width: Py_ssize_t
13428 fillchar: Py_UCS4 = ' '
13429 /
13430
13431Return a right-justified string of length width.
13432
13433Padding is done using the specified fill character (default is a space).
13434[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013435
13436static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013437unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13438/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013439{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013440 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013441 return NULL;
13442
Victor Stinnerc4b49542011-12-11 22:44:26 +010013443 if (PyUnicode_GET_LENGTH(self) >= width)
13444 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013445
Victor Stinnerc4b49542011-12-11 22:44:26 +010013446 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013447}
13448
Alexander Belopolsky40018472011-02-26 01:02:56 +000013449PyObject *
13450PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013451{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013452 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013453 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013454
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013455 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013456}
13457
INADA Naoki3ae20562017-01-16 20:41:20 +090013458/*[clinic input]
13459str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013460
INADA Naoki3ae20562017-01-16 20:41:20 +090013461 sep: object = None
13462 The delimiter according which to split the string.
13463 None (the default value) means split according to any whitespace,
13464 and discard empty strings from the result.
13465 maxsplit: Py_ssize_t = -1
13466 Maximum number of splits to do.
13467 -1 (the default value) means no limit.
13468
13469Return a list of the words in the string, using sep as the delimiter string.
13470[clinic start generated code]*/
13471
13472static PyObject *
13473unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13474/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013475{
INADA Naoki3ae20562017-01-16 20:41:20 +090013476 if (sep == Py_None)
13477 return split(self, NULL, maxsplit);
13478 if (PyUnicode_Check(sep))
13479 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013480
Victor Stinner998b8062018-09-12 00:23:25 +020013481 PyErr_Format(PyExc_TypeError,
13482 "must be str or None, not %.100s",
13483 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013484 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013485}
13486
Thomas Wouters477c8d52006-05-27 19:21:47 +000013487PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013488PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013489{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013490 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013491 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013492 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013493 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013494
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013495 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013496 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013497
Victor Stinner14f8f022011-10-05 20:58:25 +020013498 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013499 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013500 len1 = PyUnicode_GET_LENGTH(str_obj);
13501 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013502 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013503 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013504 return PyTuple_Pack(3, str_obj, empty, empty);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013505 }
13506 buf1 = PyUnicode_DATA(str_obj);
13507 buf2 = PyUnicode_DATA(sep_obj);
13508 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013509 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013510 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013511 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013512 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013513
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013514 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013515 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013516 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13517 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13518 else
13519 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013520 break;
13521 case PyUnicode_2BYTE_KIND:
13522 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13523 break;
13524 case PyUnicode_4BYTE_KIND:
13525 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13526 break;
13527 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013528 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013529 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013530
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013531 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013532 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013533 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013534
13535 return out;
13536}
13537
13538
13539PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013540PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013541{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013542 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013543 int kind1, kind2;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013544 const void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013545 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013546
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013547 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013548 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013549
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013550 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013551 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013552 len1 = PyUnicode_GET_LENGTH(str_obj);
13553 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013554 if (kind1 < kind2 || len1 < len2) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020013555 PyObject *empty = unicode_get_empty(); // Borrowed reference
Victor Stinner90ed8a62020-06-24 00:34:07 +020013556 return PyTuple_Pack(3, empty, empty, str_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013557 }
13558 buf1 = PyUnicode_DATA(str_obj);
13559 buf2 = PyUnicode_DATA(sep_obj);
13560 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013561 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013562 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013563 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013565
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013566 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013567 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013568 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13569 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13570 else
13571 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013572 break;
13573 case PyUnicode_2BYTE_KIND:
13574 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13575 break;
13576 case PyUnicode_4BYTE_KIND:
13577 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13578 break;
13579 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013580 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013581 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013582
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013583 assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(sep_obj)));
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013584 if (kind2 != kind1)
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013585 PyMem_Free((void *)buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013586
13587 return out;
13588}
13589
INADA Naoki3ae20562017-01-16 20:41:20 +090013590/*[clinic input]
13591str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013592
INADA Naoki3ae20562017-01-16 20:41:20 +090013593 sep: object
13594 /
13595
13596Partition the string into three parts using the given separator.
13597
13598This will search for the separator in the string. If the separator is found,
13599returns a 3-tuple containing the part before the separator, the separator
13600itself, and the part after it.
13601
13602If the separator is not found, returns a 3-tuple containing the original string
13603and two empty strings.
13604[clinic start generated code]*/
13605
13606static PyObject *
13607unicode_partition(PyObject *self, PyObject *sep)
13608/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013609{
INADA Naoki3ae20562017-01-16 20:41:20 +090013610 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013611}
13612
INADA Naoki3ae20562017-01-16 20:41:20 +090013613/*[clinic input]
13614str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013615
INADA Naoki3ae20562017-01-16 20:41:20 +090013616Partition the string into three parts using the given separator.
13617
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013618This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013619the separator is found, returns a 3-tuple containing the part before the
13620separator, the separator itself, and the part after it.
13621
13622If the separator is not found, returns a 3-tuple containing two empty strings
13623and the original string.
13624[clinic start generated code]*/
13625
13626static PyObject *
13627unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013628/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013629{
INADA Naoki3ae20562017-01-16 20:41:20 +090013630 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013631}
13632
Alexander Belopolsky40018472011-02-26 01:02:56 +000013633PyObject *
13634PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013635{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013636 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013637 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013638
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013639 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013640}
13641
INADA Naoki3ae20562017-01-16 20:41:20 +090013642/*[clinic input]
13643str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013644
INADA Naoki3ae20562017-01-16 20:41:20 +090013645Return a list of the words in the string, using sep as the delimiter string.
13646
13647Splits are done starting at the end of the string and working to the front.
13648[clinic start generated code]*/
13649
13650static PyObject *
13651unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13652/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013653{
INADA Naoki3ae20562017-01-16 20:41:20 +090013654 if (sep == Py_None)
13655 return rsplit(self, NULL, maxsplit);
13656 if (PyUnicode_Check(sep))
13657 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013658
Victor Stinner998b8062018-09-12 00:23:25 +020013659 PyErr_Format(PyExc_TypeError,
13660 "must be str or None, not %.100s",
13661 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013662 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013663}
13664
INADA Naoki3ae20562017-01-16 20:41:20 +090013665/*[clinic input]
13666str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013667
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013668 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013669
13670Return a list of the lines in the string, breaking at line boundaries.
13671
13672Line breaks are not included in the resulting list unless keepends is given and
13673true.
13674[clinic start generated code]*/
13675
13676static PyObject *
13677unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013678/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013679{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013680 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013681}
13682
13683static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013684PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013685{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013686 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013687}
13688
INADA Naoki3ae20562017-01-16 20:41:20 +090013689/*[clinic input]
13690str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013691
INADA Naoki3ae20562017-01-16 20:41:20 +090013692Convert uppercase characters to lowercase and lowercase characters to uppercase.
13693[clinic start generated code]*/
13694
13695static PyObject *
13696unicode_swapcase_impl(PyObject *self)
13697/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013698{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013699 if (PyUnicode_READY(self) == -1)
13700 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013701 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013702}
13703
Larry Hastings61272b72014-01-07 12:41:53 -080013704/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013705
Larry Hastings31826802013-10-19 00:09:25 -070013706@staticmethod
13707str.maketrans as unicode_maketrans
13708
13709 x: object
13710
13711 y: unicode=NULL
13712
13713 z: unicode=NULL
13714
13715 /
13716
13717Return a translation table usable for str.translate().
13718
13719If there is only one argument, it must be a dictionary mapping Unicode
13720ordinals (integers) or characters to Unicode ordinals, strings or None.
13721Character keys will be then converted to ordinals.
13722If there are two arguments, they must be strings of equal length, and
13723in the resulting dictionary, each character in x will be mapped to the
13724character at the same position in y. If there is a third argument, it
13725must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013726[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013727
Larry Hastings31826802013-10-19 00:09:25 -070013728static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013729unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013730/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013731{
Georg Brandlceee0772007-11-27 23:48:05 +000013732 PyObject *new = NULL, *key, *value;
13733 Py_ssize_t i = 0;
13734 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013735
Georg Brandlceee0772007-11-27 23:48:05 +000013736 new = PyDict_New();
13737 if (!new)
13738 return NULL;
13739 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013740 int x_kind, y_kind, z_kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013741 const void *x_data, *y_data, *z_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013742
Georg Brandlceee0772007-11-27 23:48:05 +000013743 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013744 if (!PyUnicode_Check(x)) {
13745 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13746 "be a string if there is a second argument");
13747 goto err;
13748 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013749 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013750 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13751 "arguments must have equal length");
13752 goto err;
13753 }
13754 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013755 x_kind = PyUnicode_KIND(x);
13756 y_kind = PyUnicode_KIND(y);
13757 x_data = PyUnicode_DATA(x);
13758 y_data = PyUnicode_DATA(y);
13759 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13760 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013761 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013762 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013763 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013764 if (!value) {
13765 Py_DECREF(key);
13766 goto err;
13767 }
Georg Brandlceee0772007-11-27 23:48:05 +000013768 res = PyDict_SetItem(new, key, value);
13769 Py_DECREF(key);
13770 Py_DECREF(value);
13771 if (res < 0)
13772 goto err;
13773 }
13774 /* create entries for deleting chars in z */
13775 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013776 z_kind = PyUnicode_KIND(z);
13777 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013778 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013779 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013780 if (!key)
13781 goto err;
13782 res = PyDict_SetItem(new, key, Py_None);
13783 Py_DECREF(key);
13784 if (res < 0)
13785 goto err;
13786 }
13787 }
13788 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013789 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013790 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013791
Georg Brandlceee0772007-11-27 23:48:05 +000013792 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013793 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013794 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13795 "to maketrans it must be a dict");
13796 goto err;
13797 }
13798 /* copy entries into the new dict, converting string keys to int keys */
13799 while (PyDict_Next(x, &i, &key, &value)) {
13800 if (PyUnicode_Check(key)) {
13801 /* convert string keys to integer keys */
13802 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013803 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013804 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13805 "table must be of length 1");
13806 goto err;
13807 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013808 kind = PyUnicode_KIND(key);
13809 data = PyUnicode_DATA(key);
13810 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013811 if (!newkey)
13812 goto err;
13813 res = PyDict_SetItem(new, newkey, value);
13814 Py_DECREF(newkey);
13815 if (res < 0)
13816 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013817 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013818 /* just keep integer keys */
13819 if (PyDict_SetItem(new, key, value) < 0)
13820 goto err;
13821 } else {
13822 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13823 "be strings or integers");
13824 goto err;
13825 }
13826 }
13827 }
13828 return new;
13829 err:
13830 Py_DECREF(new);
13831 return NULL;
13832}
13833
INADA Naoki3ae20562017-01-16 20:41:20 +090013834/*[clinic input]
13835str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013836
INADA Naoki3ae20562017-01-16 20:41:20 +090013837 table: object
13838 Translation table, which must be a mapping of Unicode ordinals to
13839 Unicode ordinals, strings, or None.
13840 /
13841
13842Replace each character in the string using the given translation table.
13843
13844The table must implement lookup/indexing via __getitem__, for instance a
13845dictionary or list. If this operation raises LookupError, the character is
13846left untouched. Characters mapped to None are deleted.
13847[clinic start generated code]*/
13848
13849static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013850unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013851/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013852{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013853 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013854}
13855
INADA Naoki3ae20562017-01-16 20:41:20 +090013856/*[clinic input]
13857str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013858
INADA Naoki3ae20562017-01-16 20:41:20 +090013859Return a copy of the string converted to uppercase.
13860[clinic start generated code]*/
13861
13862static PyObject *
13863unicode_upper_impl(PyObject *self)
13864/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013865{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013866 if (PyUnicode_READY(self) == -1)
13867 return NULL;
13868 if (PyUnicode_IS_ASCII(self))
13869 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013870 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013871}
13872
INADA Naoki3ae20562017-01-16 20:41:20 +090013873/*[clinic input]
13874str.zfill as unicode_zfill
13875
13876 width: Py_ssize_t
13877 /
13878
13879Pad a numeric string with zeros on the left, to fill a field of the given width.
13880
13881The string is never truncated.
13882[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013883
13884static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013885unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013886/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013887{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013888 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013889 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013890 int kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030013891 const void *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013892 Py_UCS4 chr;
13893
Benjamin Petersonbac79492012-01-14 13:34:47 -050013894 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013895 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013896
Victor Stinnerc4b49542011-12-11 22:44:26 +010013897 if (PyUnicode_GET_LENGTH(self) >= width)
13898 return unicode_result_unchanged(self);
13899
13900 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013901
13902 u = pad(self, fill, 0, '0');
13903
Walter Dörwald068325e2002-04-15 13:36:47 +000013904 if (u == NULL)
13905 return NULL;
13906
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013907 kind = PyUnicode_KIND(u);
13908 data = PyUnicode_DATA(u);
13909 chr = PyUnicode_READ(kind, data, fill);
13910
13911 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013912 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013913 PyUnicode_WRITE(kind, data, 0, chr);
13914 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013915 }
13916
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013917 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013918 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013919}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013920
#if 0
/* Compiled out by the surrounding #if 0.  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13928
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013929PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013930 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013931\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013932Return True if S starts with the specified prefix, False otherwise.\n\
13933With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013934With optional end, stop comparing S at that position.\n\
13935prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013936
13937static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013938unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013939 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013940{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013941 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013942 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013943 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013944 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013945 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013946
Jesus Ceaac451502011-04-20 17:09:23 +020013947 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013948 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013949 if (PyTuple_Check(subobj)) {
13950 Py_ssize_t i;
13951 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013952 substring = PyTuple_GET_ITEM(subobj, i);
13953 if (!PyUnicode_Check(substring)) {
13954 PyErr_Format(PyExc_TypeError,
13955 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013956 "not %.100s",
13957 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013958 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013959 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013960 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013961 if (result == -1)
13962 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013963 if (result) {
13964 Py_RETURN_TRUE;
13965 }
13966 }
13967 /* nothing matched */
13968 Py_RETURN_FALSE;
13969 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013970 if (!PyUnicode_Check(subobj)) {
13971 PyErr_Format(PyExc_TypeError,
13972 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013973 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013974 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013975 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013976 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013977 if (result == -1)
13978 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013979 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013980}
13981
13982
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013983PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013984 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013985\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013986Return True if S ends with the specified suffix, False otherwise.\n\
13987With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013988With optional end, stop comparing S at that position.\n\
13989suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013990
13991static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013992unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013993 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013994{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013995 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013996 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013997 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013998 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013999 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014000
Jesus Ceaac451502011-04-20 17:09:23 +020014001 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000014002 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014003 if (PyTuple_Check(subobj)) {
14004 Py_ssize_t i;
14005 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014006 substring = PyTuple_GET_ITEM(subobj, i);
14007 if (!PyUnicode_Check(substring)) {
14008 PyErr_Format(PyExc_TypeError,
14009 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020014010 "not %.100s",
14011 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000014012 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014013 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014014 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010014015 if (result == -1)
14016 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014017 if (result) {
14018 Py_RETURN_TRUE;
14019 }
14020 }
14021 Py_RETURN_FALSE;
14022 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014023 if (!PyUnicode_Check(subobj)) {
14024 PyErr_Format(PyExc_TypeError,
14025 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020014026 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000014027 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030014028 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014029 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010014030 if (result == -1)
14031 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000014032 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014033}
14034
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014035static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014036_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014037{
Victor Stinnereb36fda2015-10-03 01:55:51 +020014038 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
14039 writer->data = PyUnicode_DATA(writer->buffer);
14040
14041 if (!writer->readonly) {
14042 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020014043 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020014044 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014045 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020014046 /* use a value smaller than PyUnicode_1BYTE_KIND() so
14047 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
14048 writer->kind = PyUnicode_WCHAR_KIND;
14049 assert(writer->kind <= PyUnicode_1BYTE_KIND);
14050
Victor Stinner8f674cc2013-04-17 23:02:17 +020014051 /* Copy-on-write mode: set buffer size to 0 so
14052 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
14053 * next write. */
14054 writer->size = 0;
14055 }
Victor Stinner202fdca2012-05-07 12:47:02 +020014056}
14057
Victor Stinnerd3f08822012-05-29 12:57:52 +020014058void
Victor Stinner8f674cc2013-04-17 23:02:17 +020014059_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014060{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014061 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020014062
14063 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020014064 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020014065
14066 /* use a value smaller than PyUnicode_1BYTE_KIND() so
14067 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
14068 writer->kind = PyUnicode_WCHAR_KIND;
14069 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020014070}
14071
Inada Naoki770847a2019-06-24 12:30:24 +090014072// Initialize _PyUnicodeWriter with initial buffer
14073static inline void
14074_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
14075{
14076 memset(writer, 0, sizeof(*writer));
14077 writer->buffer = buffer;
14078 _PyUnicodeWriter_Update(writer);
14079 writer->min_length = writer->size;
14080}
14081
Victor Stinnerd3f08822012-05-29 12:57:52 +020014082int
14083_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
14084 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020014085{
14086 Py_ssize_t newlen;
14087 PyObject *newbuffer;
14088
Victor Stinner2740e462016-09-06 16:58:36 -070014089 assert(maxchar <= MAX_UNICODE);
14090
Victor Stinnerca9381e2015-09-22 00:58:32 +020014091 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020014092 assert((maxchar > writer->maxchar && length >= 0)
14093 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014094
Victor Stinner202fdca2012-05-07 12:47:02 +020014095 if (length > PY_SSIZE_T_MAX - writer->pos) {
14096 PyErr_NoMemory();
14097 return -1;
14098 }
14099 newlen = writer->pos + length;
14100
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014101 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020014102
Victor Stinnerd3f08822012-05-29 12:57:52 +020014103 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020014104 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010014105 if (writer->overallocate
14106 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14107 /* overallocate to limit the number of realloc() */
14108 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014109 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014110 if (newlen < writer->min_length)
14111 newlen = writer->min_length;
14112
Victor Stinnerd3f08822012-05-29 12:57:52 +020014113 writer->buffer = PyUnicode_New(newlen, maxchar);
14114 if (writer->buffer == NULL)
14115 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014116 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014117 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010014118 if (writer->overallocate
14119 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
14120 /* overallocate to limit the number of realloc() */
14121 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014122 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014123 if (newlen < writer->min_length)
14124 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014125
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014126 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020014127 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030014128 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020014129 newbuffer = PyUnicode_New(newlen, maxchar);
14130 if (newbuffer == NULL)
14131 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014132 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14133 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020014134 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014135 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020014136 }
14137 else {
14138 newbuffer = resize_compact(writer->buffer, newlen);
14139 if (newbuffer == NULL)
14140 return -1;
14141 }
14142 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020014143 }
14144 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014145 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014146 newbuffer = PyUnicode_New(writer->size, maxchar);
14147 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020014148 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014149 _PyUnicode_FastCopyCharacters(newbuffer, 0,
14150 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030014151 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014152 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020014153 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020014154 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010014155
14156#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020014157}
14158
Victor Stinnerca9381e2015-09-22 00:58:32 +020014159int
14160_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
14161 enum PyUnicode_Kind kind)
14162{
14163 Py_UCS4 maxchar;
14164
14165 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
14166 assert(writer->kind < kind);
14167
14168 switch (kind)
14169 {
14170 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
14171 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
Victor Stinner99768342021-03-17 21:46:53 +010014172 case PyUnicode_4BYTE_KIND: maxchar = MAX_UNICODE; break;
Victor Stinnerca9381e2015-09-22 00:58:32 +020014173 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014174 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020014175 }
14176
14177 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
14178}
14179
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070014180static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014181_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020014182{
Victor Stinner2740e462016-09-06 16:58:36 -070014183 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020014184 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
14185 return -1;
14186 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
14187 writer->pos++;
14188 return 0;
14189}
14190
14191int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014192_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
14193{
14194 return _PyUnicodeWriter_WriteCharInline(writer, ch);
14195}
14196
14197int
Victor Stinnerd3f08822012-05-29 12:57:52 +020014198_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
14199{
14200 Py_UCS4 maxchar;
14201 Py_ssize_t len;
14202
14203 if (PyUnicode_READY(str) == -1)
14204 return -1;
14205 len = PyUnicode_GET_LENGTH(str);
14206 if (len == 0)
14207 return 0;
14208 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
14209 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014210 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010014211 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020014212 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014213 Py_INCREF(str);
14214 writer->buffer = str;
14215 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014216 writer->pos += len;
14217 return 0;
14218 }
14219 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
14220 return -1;
14221 }
14222 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14223 str, 0, len);
14224 writer->pos += len;
14225 return 0;
14226}
14227
Victor Stinnere215d962012-10-06 23:03:36 +020014228int
Victor Stinnercfc4c132013-04-03 01:48:39 +020014229_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
14230 Py_ssize_t start, Py_ssize_t end)
14231{
14232 Py_UCS4 maxchar;
14233 Py_ssize_t len;
14234
14235 if (PyUnicode_READY(str) == -1)
14236 return -1;
14237
14238 assert(0 <= start);
14239 assert(end <= PyUnicode_GET_LENGTH(str));
14240 assert(start <= end);
14241
14242 if (end == 0)
14243 return 0;
14244
14245 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
14246 return _PyUnicodeWriter_WriteStr(writer, str);
14247
14248 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
14249 maxchar = _PyUnicode_FindMaxChar(str, start, end);
14250 else
14251 maxchar = writer->maxchar;
14252 len = end - start;
14253
14254 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
14255 return -1;
14256
14257 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14258 str, start, len);
14259 writer->pos += len;
14260 return 0;
14261}
14262
14263int
Victor Stinner4a587072013-11-19 12:54:53 +010014264_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
14265 const char *ascii, Py_ssize_t len)
14266{
14267 if (len == -1)
14268 len = strlen(ascii);
14269
Andy Lestere6be9b52020-02-11 20:28:35 -060014270 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010014271
14272 if (writer->buffer == NULL && !writer->overallocate) {
14273 PyObject *str;
14274
14275 str = _PyUnicode_FromASCII(ascii, len);
14276 if (str == NULL)
14277 return -1;
14278
14279 writer->readonly = 1;
14280 writer->buffer = str;
14281 _PyUnicodeWriter_Update(writer);
14282 writer->pos += len;
14283 return 0;
14284 }
14285
14286 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
14287 return -1;
14288
14289 switch (writer->kind)
14290 {
14291 case PyUnicode_1BYTE_KIND:
14292 {
14293 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
14294 Py_UCS1 *data = writer->data;
14295
Christian Heimesf051e432016-09-13 20:22:02 +020014296 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010014297 break;
14298 }
14299 case PyUnicode_2BYTE_KIND:
14300 {
14301 _PyUnicode_CONVERT_BYTES(
14302 Py_UCS1, Py_UCS2,
14303 ascii, ascii + len,
14304 (Py_UCS2 *)writer->data + writer->pos);
14305 break;
14306 }
14307 case PyUnicode_4BYTE_KIND:
14308 {
14309 _PyUnicode_CONVERT_BYTES(
14310 Py_UCS1, Py_UCS4,
14311 ascii, ascii + len,
14312 (Py_UCS4 *)writer->data + writer->pos);
14313 break;
14314 }
14315 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014316 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010014317 }
14318
14319 writer->pos += len;
14320 return 0;
14321}
14322
14323int
14324_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
14325 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020014326{
14327 Py_UCS4 maxchar;
14328
Andy Lestere6be9b52020-02-11 20:28:35 -060014329 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020014330 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
14331 return -1;
14332 unicode_write_cstr(writer->buffer, writer->pos, str, len);
14333 writer->pos += len;
14334 return 0;
14335}
14336
Victor Stinnerd3f08822012-05-29 12:57:52 +020014337PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014338_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014339{
Victor Stinner15a0bd32013-07-08 22:29:55 +020014340 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014341
Victor Stinnerd3f08822012-05-29 12:57:52 +020014342 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014343 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020014344 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020014345 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014346
14347 str = writer->buffer;
14348 writer->buffer = NULL;
14349
Victor Stinnerd7b7c742012-06-04 22:52:12 +020014350 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020014351 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
14352 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014353 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014354
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014355 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
14356 PyObject *str2;
14357 str2 = resize_compact(str, writer->pos);
14358 if (str2 == NULL) {
14359 Py_DECREF(str);
14360 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014361 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030014362 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020014363 }
14364
Victor Stinner15a0bd32013-07-08 22:29:55 +020014365 assert(_PyUnicode_CheckConsistency(str, 1));
14366 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020014367}
14368
Victor Stinnerd3f08822012-05-29 12:57:52 +020014369void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020014370_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020014371{
14372 Py_CLEAR(writer->buffer);
14373}
14374
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014375#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000014376
14377PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000014378 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000014379\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014380Return a formatted version of S, using substitutions from args and kwargs.\n\
14381The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000014382
Eric Smith27bbca62010-11-04 17:06:58 +000014383PyDoc_STRVAR(format_map__doc__,
14384 "S.format_map(mapping) -> str\n\
14385\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000014386Return a formatted version of S, using substitutions from mapping.\n\
14387The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000014388
INADA Naoki3ae20562017-01-16 20:41:20 +090014389/*[clinic input]
14390str.__format__ as unicode___format__
14391
14392 format_spec: unicode
14393 /
14394
14395Return a formatted version of the string as described by format_spec.
14396[clinic start generated code]*/
14397
Eric Smith4a7d76d2008-05-30 18:10:19 +000014398static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014399unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090014400/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000014401{
Victor Stinnerd3f08822012-05-29 12:57:52 +020014402 _PyUnicodeWriter writer;
14403 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000014404
Victor Stinnerd3f08822012-05-29 12:57:52 +020014405 if (PyUnicode_READY(self) == -1)
14406 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020014407 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014408 ret = _PyUnicode_FormatAdvancedWriter(&writer,
14409 self, format_spec, 0,
14410 PyUnicode_GET_LENGTH(format_spec));
14411 if (ret == -1) {
14412 _PyUnicodeWriter_Dealloc(&writer);
14413 return NULL;
14414 }
14415 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000014416}
14417
INADA Naoki3ae20562017-01-16 20:41:20 +090014418/*[clinic input]
14419str.__sizeof__ as unicode_sizeof
14420
14421Return the size of the string in memory, in bytes.
14422[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014423
14424static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014425unicode_sizeof_impl(PyObject *self)
14426/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014427{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014428 Py_ssize_t size;
14429
14430 /* If it's a compact object, account for base structure +
14431 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014432 if (PyUnicode_IS_COMPACT_ASCII(self))
14433 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14434 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014435 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014436 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014437 else {
14438 /* If it is a two-block object, account for base object, and
14439 for character block if present. */
14440 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014441 if (_PyUnicode_DATA_ANY(self))
14442 size += (PyUnicode_GET_LENGTH(self) + 1) *
14443 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014444 }
14445 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014446 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014447 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14448 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14449 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14450 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014451
14452 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014453}
14454
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014455static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014456unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014457{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014458 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014459 if (!copy)
14460 return NULL;
14461 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014462}
14463
Guido van Rossumd57fd912000-03-10 22:53:23 +000014464static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014465 UNICODE_ENCODE_METHODDEF
14466 UNICODE_REPLACE_METHODDEF
14467 UNICODE_SPLIT_METHODDEF
14468 UNICODE_RSPLIT_METHODDEF
14469 UNICODE_JOIN_METHODDEF
14470 UNICODE_CAPITALIZE_METHODDEF
14471 UNICODE_CASEFOLD_METHODDEF
14472 UNICODE_TITLE_METHODDEF
14473 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014474 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014475 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014476 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014477 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014478 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014479 UNICODE_LJUST_METHODDEF
14480 UNICODE_LOWER_METHODDEF
14481 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014482 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14483 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014484 UNICODE_RJUST_METHODDEF
14485 UNICODE_RSTRIP_METHODDEF
14486 UNICODE_RPARTITION_METHODDEF
14487 UNICODE_SPLITLINES_METHODDEF
14488 UNICODE_STRIP_METHODDEF
14489 UNICODE_SWAPCASE_METHODDEF
14490 UNICODE_TRANSLATE_METHODDEF
14491 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014492 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14493 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
sweeneydea81849b2020-04-22 17:05:48 -040014494 UNICODE_REMOVEPREFIX_METHODDEF
14495 UNICODE_REMOVESUFFIX_METHODDEF
INADA Naokia49ac992018-01-27 14:06:21 +090014496 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014497 UNICODE_ISLOWER_METHODDEF
14498 UNICODE_ISUPPER_METHODDEF
14499 UNICODE_ISTITLE_METHODDEF
14500 UNICODE_ISSPACE_METHODDEF
14501 UNICODE_ISDECIMAL_METHODDEF
14502 UNICODE_ISDIGIT_METHODDEF
14503 UNICODE_ISNUMERIC_METHODDEF
14504 UNICODE_ISALPHA_METHODDEF
14505 UNICODE_ISALNUM_METHODDEF
14506 UNICODE_ISIDENTIFIER_METHODDEF
14507 UNICODE_ISPRINTABLE_METHODDEF
14508 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014509 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014510 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014511 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014512 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014513 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014514#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014515 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014516 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014517#endif
14518
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014519 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014520 {NULL, NULL}
14521};
14522
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014523static PyObject *
14524unicode_mod(PyObject *v, PyObject *w)
14525{
Brian Curtindfc80e32011-08-10 20:28:54 -050014526 if (!PyUnicode_Check(v))
14527 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014528 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014529}
14530
14531static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014532 0, /*nb_add*/
14533 0, /*nb_subtract*/
14534 0, /*nb_multiply*/
14535 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014536};
14537
Guido van Rossumd57fd912000-03-10 22:53:23 +000014538static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014539 (lenfunc) unicode_length, /* sq_length */
14540 PyUnicode_Concat, /* sq_concat */
14541 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14542 (ssizeargfunc) unicode_getitem, /* sq_item */
14543 0, /* sq_slice */
14544 0, /* sq_ass_item */
14545 0, /* sq_ass_slice */
14546 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014547};
14548
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014549static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014550unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014551{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014552 if (PyUnicode_READY(self) == -1)
14553 return NULL;
14554
Victor Stinnera15e2602020-04-08 02:01:56 +020014555 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014556 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014557 if (i == -1 && PyErr_Occurred())
14558 return NULL;
14559 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014560 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014561 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014562 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014563 Py_ssize_t start, stop, step, slicelength, i;
14564 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014565 PyObject *result;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014566 const void *src_data;
14567 void *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014568 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014569 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014570
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014571 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014572 return NULL;
14573 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014574 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14575 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014576
14577 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014578 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014579 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014580 slicelength == PyUnicode_GET_LENGTH(self)) {
14581 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014582 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014583 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014584 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014585 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014586 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014587 src_kind = PyUnicode_KIND(self);
14588 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014589 if (!PyUnicode_IS_ASCII(self)) {
14590 kind_limit = kind_maxchar_limit(src_kind);
14591 max_char = 0;
14592 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14593 ch = PyUnicode_READ(src_kind, src_data, cur);
14594 if (ch > max_char) {
14595 max_char = ch;
14596 if (max_char >= kind_limit)
14597 break;
14598 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014599 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014600 }
Victor Stinner55c99112011-10-13 01:17:06 +020014601 else
14602 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014603 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014604 if (result == NULL)
14605 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014606 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014607 dest_data = PyUnicode_DATA(result);
14608
14609 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014610 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14611 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014612 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014613 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014614 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014615 } else {
14616 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14617 return NULL;
14618 }
14619}
14620
14621static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014622 (lenfunc)unicode_length, /* mp_length */
14623 (binaryfunc)unicode_subscript, /* mp_subscript */
14624 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014625};
14626
Guido van Rossumd57fd912000-03-10 22:53:23 +000014627
Guido van Rossumd57fd912000-03-10 22:53:23 +000014628/* Helpers for PyUnicode_Format() */
14629
Victor Stinnera47082312012-10-04 02:19:54 +020014630struct unicode_formatter_t {
14631 PyObject *args;
14632 int args_owned;
14633 Py_ssize_t arglen, argidx;
14634 PyObject *dict;
14635
14636 enum PyUnicode_Kind fmtkind;
14637 Py_ssize_t fmtcnt, fmtpos;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030014638 const void *fmtdata;
Victor Stinnera47082312012-10-04 02:19:54 +020014639 PyObject *fmtstr;
14640
14641 _PyUnicodeWriter writer;
14642};
14643
14644struct unicode_format_arg_t {
14645 Py_UCS4 ch;
14646 int flags;
14647 Py_ssize_t width;
14648 int prec;
14649 int sign;
14650};
14651
Guido van Rossumd57fd912000-03-10 22:53:23 +000014652static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014653unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014654{
Victor Stinnera47082312012-10-04 02:19:54 +020014655 Py_ssize_t argidx = ctx->argidx;
14656
14657 if (argidx < ctx->arglen) {
14658 ctx->argidx++;
14659 if (ctx->arglen < 0)
14660 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014661 else
Victor Stinnera47082312012-10-04 02:19:54 +020014662 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014663 }
14664 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014665 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014666 return NULL;
14667}
14668
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014669/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014670
Victor Stinnera47082312012-10-04 02:19:54 +020014671/* Format a float into the writer if the writer is not NULL, or into *p_output
14672 otherwise.
14673
14674 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014675static int
Victor Stinnera47082312012-10-04 02:19:54 +020014676formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14677 PyObject **p_output,
14678 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014679{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014680 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014681 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014682 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014683 int prec;
14684 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014685
Guido van Rossumd57fd912000-03-10 22:53:23 +000014686 x = PyFloat_AsDouble(v);
14687 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014688 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014689
Victor Stinnera47082312012-10-04 02:19:54 +020014690 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014691 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014692 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014693
Victor Stinnera47082312012-10-04 02:19:54 +020014694 if (arg->flags & F_ALT)
14695 dtoa_flags = Py_DTSF_ALT;
14696 else
14697 dtoa_flags = 0;
14698 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014699 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014700 return -1;
14701 len = strlen(p);
14702 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014703 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014704 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014705 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014706 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014707 }
14708 else
14709 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014710 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014711 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014712}
14713
Victor Stinnerd0880d52012-04-27 23:40:13 +020014714/* formatlong() emulates the format codes d, u, o, x and X, and
14715 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14716 * Python's regular ints.
14717 * Return value: a new PyUnicodeObject*, or NULL if error.
14718 * The output string is of the form
14719 * "-"? ("0x" | "0X")? digit+
14720 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14721 * set in flags. The case of hex digits will be correct,
14722 * There will be at least prec digits, zero-filled on the left if
14723 * necessary to get that many.
14724 * val object to be converted
14725 * flags bitmask of format flags; only F_ALT is looked at
14726 * prec minimum number of digits; 0-fill on left if needed
14727 * type a character in [duoxX]; u acts the same as d
14728 *
14729 * CAUTION: o, x and X conversions on regular ints can never
14730 * produce a '-' sign, but can for Python's unbounded ints.
14731 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014732PyObject *
14733_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014734{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014735 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014736 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014737 Py_ssize_t i;
14738 int sign; /* 1 if '-', else 0 */
14739 int len; /* number of characters */
14740 Py_ssize_t llen;
14741 int numdigits; /* len == numnondigits + numdigits */
14742 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014743
Victor Stinnerd0880d52012-04-27 23:40:13 +020014744 /* Avoid exceeding SSIZE_T_MAX */
14745 if (prec > INT_MAX-3) {
14746 PyErr_SetString(PyExc_OverflowError,
14747 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014748 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014749 }
14750
14751 assert(PyLong_Check(val));
14752
14753 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014754 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014755 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014756 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014757 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014758 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014759 /* int and int subclasses should print numerically when a numeric */
14760 /* format code is used (see issue18780) */
14761 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014762 break;
14763 case 'o':
14764 numnondigits = 2;
14765 result = PyNumber_ToBase(val, 8);
14766 break;
14767 case 'x':
14768 case 'X':
14769 numnondigits = 2;
14770 result = PyNumber_ToBase(val, 16);
14771 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014772 }
14773 if (!result)
14774 return NULL;
14775
14776 assert(unicode_modifiable(result));
14777 assert(PyUnicode_IS_READY(result));
14778 assert(PyUnicode_IS_ASCII(result));
14779
14780 /* To modify the string in-place, there can only be one reference. */
14781 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014782 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014783 PyErr_BadInternalCall();
14784 return NULL;
14785 }
14786 buf = PyUnicode_DATA(result);
14787 llen = PyUnicode_GET_LENGTH(result);
14788 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014789 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014790 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014791 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014792 return NULL;
14793 }
14794 len = (int)llen;
14795 sign = buf[0] == '-';
14796 numnondigits += sign;
14797 numdigits = len - numnondigits;
14798 assert(numdigits > 0);
14799
14800 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014801 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014802 (type == 'o' || type == 'x' || type == 'X'))) {
14803 assert(buf[sign] == '0');
14804 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14805 buf[sign+1] == 'o');
14806 numnondigits -= 2;
14807 buf += 2;
14808 len -= 2;
14809 if (sign)
14810 buf[0] = '-';
14811 assert(len == numnondigits + numdigits);
14812 assert(numdigits > 0);
14813 }
14814
14815 /* Fill with leading zeroes to meet minimum width. */
14816 if (prec > numdigits) {
14817 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14818 numnondigits + prec);
14819 char *b1;
14820 if (!r1) {
14821 Py_DECREF(result);
14822 return NULL;
14823 }
14824 b1 = PyBytes_AS_STRING(r1);
14825 for (i = 0; i < numnondigits; ++i)
14826 *b1++ = *buf++;
14827 for (i = 0; i < prec - numdigits; i++)
14828 *b1++ = '0';
14829 for (i = 0; i < numdigits; i++)
14830 *b1++ = *buf++;
14831 *b1 = '\0';
14832 Py_DECREF(result);
14833 result = r1;
14834 buf = PyBytes_AS_STRING(result);
14835 len = numnondigits + prec;
14836 }
14837
14838 /* Fix up case for hex conversions. */
14839 if (type == 'X') {
14840 /* Need to convert all lower case letters to upper case.
14841 and need to convert 0x to 0X (and -0x to -0X). */
14842 for (i = 0; i < len; i++)
14843 if (buf[i] >= 'a' && buf[i] <= 'x')
14844 buf[i] -= 'a'-'A';
14845 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014846 if (!PyUnicode_Check(result)
14847 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014848 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014849 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014850 Py_DECREF(result);
14851 result = unicode;
14852 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014853 else if (len != PyUnicode_GET_LENGTH(result)) {
14854 if (PyUnicode_Resize(&result, len) < 0)
14855 Py_CLEAR(result);
14856 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014857 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014858}
14859
Ethan Furmandf3ed242014-01-05 06:50:30 -080014860/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014861 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014862 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014863 * -1 and raise an exception on error */
14864static int
Victor Stinnera47082312012-10-04 02:19:54 +020014865mainformatlong(PyObject *v,
14866 struct unicode_format_arg_t *arg,
14867 PyObject **p_output,
14868 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014869{
14870 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014871 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014872
14873 if (!PyNumber_Check(v))
14874 goto wrongtype;
14875
Ethan Furman9ab74802014-03-21 06:38:46 -070014876 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014877 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014878 if (type == 'o' || type == 'x' || type == 'X') {
Serhiy Storchaka5f4b229d2020-05-28 10:33:45 +030014879 iobj = _PyNumber_Index(v);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014880 }
14881 else {
14882 iobj = PyNumber_Long(v);
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014883 }
14884 if (iobj == NULL ) {
14885 if (PyErr_ExceptionMatches(PyExc_TypeError))
14886 goto wrongtype;
14887 return -1;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014888 }
14889 assert(PyLong_Check(iobj));
14890 }
14891 else {
14892 iobj = v;
14893 Py_INCREF(iobj);
14894 }
14895
14896 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014897 && arg->width == -1 && arg->prec == -1
14898 && !(arg->flags & (F_SIGN | F_BLANK))
14899 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014900 {
14901 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014902 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014903 int base;
14904
Victor Stinnera47082312012-10-04 02:19:54 +020014905 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014906 {
14907 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014908 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014909 case 'd':
14910 case 'i':
14911 case 'u':
14912 base = 10;
14913 break;
14914 case 'o':
14915 base = 8;
14916 break;
14917 case 'x':
14918 case 'X':
14919 base = 16;
14920 break;
14921 }
14922
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014923 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14924 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014925 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014926 }
14927 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014928 return 1;
14929 }
14930
Ethan Furmanb95b5612015-01-23 20:05:18 -080014931 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014932 Py_DECREF(iobj);
14933 if (res == NULL)
14934 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014935 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014936 return 0;
14937
14938wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014939 switch(type)
14940 {
14941 case 'o':
14942 case 'x':
14943 case 'X':
14944 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014945 "%%%c format: an integer is required, "
14946 "not %.200s",
14947 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014948 break;
14949 default:
14950 PyErr_Format(PyExc_TypeError,
Serhiy Storchakae2ec0b22020-10-09 14:14:37 +030014951 "%%%c format: a real number is required, "
Victor Stinner998b8062018-09-12 00:23:25 +020014952 "not %.200s",
14953 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014954 break;
14955 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014956 return -1;
14957}
14958
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014959static Py_UCS4
14960formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014961{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014962 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014963 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014964 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014965 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014966 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014967 goto onError;
14968 }
14969 else {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014970 int overflow;
14971 long x = PyLong_AsLongAndOverflow(v, &overflow);
14972 if (x == -1 && PyErr_Occurred()) {
14973 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014974 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014975 }
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014976 return (Py_UCS4) -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014977 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014978
Victor Stinner8faf8212011-12-08 22:14:11 +010014979 if (x < 0 || x > MAX_UNICODE) {
Serhiy Storchakae67f7db2020-06-29 22:36:41 +030014980 /* this includes an overflow in converting to C long */
Benjamin Peterson29060642009-01-31 22:14:21 +000014981 PyErr_SetString(PyExc_OverflowError,
14982 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014983 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014984 }
14985
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014986 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014987 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014988
Benjamin Peterson29060642009-01-31 22:14:21 +000014989 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014990 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014991 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014992 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014993}
14994
Victor Stinnera47082312012-10-04 02:19:54 +020014995/* Parse options of an argument: flags, width, precision.
14996 Handle also "%(name)" syntax.
14997
14998 Return 0 if the argument has been formatted into arg->str.
14999 Return 1 if the argument has been written into ctx->writer,
15000 Raise an exception and return -1 on error. */
15001static int
15002unicode_format_arg_parse(struct unicode_formatter_t *ctx,
15003 struct unicode_format_arg_t *arg)
15004{
15005#define FORMAT_READ(ctx) \
15006 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
15007
15008 PyObject *v;
15009
Victor Stinnera47082312012-10-04 02:19:54 +020015010 if (arg->ch == '(') {
15011 /* Get argument value from a dictionary. Example: "%(name)s". */
15012 Py_ssize_t keystart;
15013 Py_ssize_t keylen;
15014 PyObject *key;
15015 int pcount = 1;
15016
15017 if (ctx->dict == NULL) {
15018 PyErr_SetString(PyExc_TypeError,
15019 "format requires a mapping");
15020 return -1;
15021 }
15022 ++ctx->fmtpos;
15023 --ctx->fmtcnt;
15024 keystart = ctx->fmtpos;
15025 /* Skip over balanced parentheses */
15026 while (pcount > 0 && --ctx->fmtcnt >= 0) {
15027 arg->ch = FORMAT_READ(ctx);
15028 if (arg->ch == ')')
15029 --pcount;
15030 else if (arg->ch == '(')
15031 ++pcount;
15032 ctx->fmtpos++;
15033 }
15034 keylen = ctx->fmtpos - keystart - 1;
15035 if (ctx->fmtcnt < 0 || pcount > 0) {
15036 PyErr_SetString(PyExc_ValueError,
15037 "incomplete format key");
15038 return -1;
15039 }
15040 key = PyUnicode_Substring(ctx->fmtstr,
15041 keystart, keystart + keylen);
15042 if (key == NULL)
15043 return -1;
15044 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020015045 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020015046 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020015047 }
15048 ctx->args = PyObject_GetItem(ctx->dict, key);
15049 Py_DECREF(key);
15050 if (ctx->args == NULL)
15051 return -1;
15052 ctx->args_owned = 1;
15053 ctx->arglen = -1;
15054 ctx->argidx = -2;
15055 }
15056
15057 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020015058 while (--ctx->fmtcnt >= 0) {
15059 arg->ch = FORMAT_READ(ctx);
15060 ctx->fmtpos++;
15061 switch (arg->ch) {
15062 case '-': arg->flags |= F_LJUST; continue;
15063 case '+': arg->flags |= F_SIGN; continue;
15064 case ' ': arg->flags |= F_BLANK; continue;
15065 case '#': arg->flags |= F_ALT; continue;
15066 case '0': arg->flags |= F_ZERO; continue;
15067 }
15068 break;
15069 }
15070
15071 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020015072 if (arg->ch == '*') {
15073 v = unicode_format_getnextarg(ctx);
15074 if (v == NULL)
15075 return -1;
15076 if (!PyLong_Check(v)) {
15077 PyErr_SetString(PyExc_TypeError,
15078 "* wants int");
15079 return -1;
15080 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015081 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015082 if (arg->width == -1 && PyErr_Occurred())
15083 return -1;
15084 if (arg->width < 0) {
15085 arg->flags |= F_LJUST;
15086 arg->width = -arg->width;
15087 }
15088 if (--ctx->fmtcnt >= 0) {
15089 arg->ch = FORMAT_READ(ctx);
15090 ctx->fmtpos++;
15091 }
15092 }
15093 else if (arg->ch >= '0' && arg->ch <= '9') {
15094 arg->width = arg->ch - '0';
15095 while (--ctx->fmtcnt >= 0) {
15096 arg->ch = FORMAT_READ(ctx);
15097 ctx->fmtpos++;
15098 if (arg->ch < '0' || arg->ch > '9')
15099 break;
15100 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
15101 mixing signed and unsigned comparison. Since arg->ch is between
15102 '0' and '9', casting to int is safe. */
15103 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
15104 PyErr_SetString(PyExc_ValueError,
15105 "width too big");
15106 return -1;
15107 }
15108 arg->width = arg->width*10 + (arg->ch - '0');
15109 }
15110 }
15111
15112 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020015113 if (arg->ch == '.') {
15114 arg->prec = 0;
15115 if (--ctx->fmtcnt >= 0) {
15116 arg->ch = FORMAT_READ(ctx);
15117 ctx->fmtpos++;
15118 }
15119 if (arg->ch == '*') {
15120 v = unicode_format_getnextarg(ctx);
15121 if (v == NULL)
15122 return -1;
15123 if (!PyLong_Check(v)) {
15124 PyErr_SetString(PyExc_TypeError,
15125 "* wants int");
15126 return -1;
15127 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020015128 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020015129 if (arg->prec == -1 && PyErr_Occurred())
15130 return -1;
15131 if (arg->prec < 0)
15132 arg->prec = 0;
15133 if (--ctx->fmtcnt >= 0) {
15134 arg->ch = FORMAT_READ(ctx);
15135 ctx->fmtpos++;
15136 }
15137 }
15138 else if (arg->ch >= '0' && arg->ch <= '9') {
15139 arg->prec = arg->ch - '0';
15140 while (--ctx->fmtcnt >= 0) {
15141 arg->ch = FORMAT_READ(ctx);
15142 ctx->fmtpos++;
15143 if (arg->ch < '0' || arg->ch > '9')
15144 break;
15145 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
15146 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020015147 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020015148 return -1;
15149 }
15150 arg->prec = arg->prec*10 + (arg->ch - '0');
15151 }
15152 }
15153 }
15154
15155 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
15156 if (ctx->fmtcnt >= 0) {
15157 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
15158 if (--ctx->fmtcnt >= 0) {
15159 arg->ch = FORMAT_READ(ctx);
15160 ctx->fmtpos++;
15161 }
15162 }
15163 }
15164 if (ctx->fmtcnt < 0) {
15165 PyErr_SetString(PyExc_ValueError,
15166 "incomplete format");
15167 return -1;
15168 }
15169 return 0;
15170
15171#undef FORMAT_READ
15172}
15173
15174/* Format one argument. Supported conversion specifiers:
15175
15176 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080015177 - "i", "d", "u": int or float
15178 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020015179 - "e", "E", "f", "F", "g", "G": float
15180 - "c": int or str (1 character)
15181
Victor Stinner8dbd4212012-12-04 09:30:24 +010015182 When possible, the output is written directly into the Unicode writer
15183 (ctx->writer). A string is created when padding is required.
15184
Victor Stinnera47082312012-10-04 02:19:54 +020015185 Return 0 if the argument has been formatted into *p_str,
15186 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010015187 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020015188static int
15189unicode_format_arg_format(struct unicode_formatter_t *ctx,
15190 struct unicode_format_arg_t *arg,
15191 PyObject **p_str)
15192{
15193 PyObject *v;
15194 _PyUnicodeWriter *writer = &ctx->writer;
15195
15196 if (ctx->fmtcnt == 0)
15197 ctx->writer.overallocate = 0;
15198
Victor Stinnera47082312012-10-04 02:19:54 +020015199 v = unicode_format_getnextarg(ctx);
15200 if (v == NULL)
15201 return -1;
15202
Victor Stinnera47082312012-10-04 02:19:54 +020015203
15204 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020015205 case 's':
15206 case 'r':
15207 case 'a':
15208 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
15209 /* Fast path */
15210 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
15211 return -1;
15212 return 1;
15213 }
15214
15215 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
15216 *p_str = v;
15217 Py_INCREF(*p_str);
15218 }
15219 else {
15220 if (arg->ch == 's')
15221 *p_str = PyObject_Str(v);
15222 else if (arg->ch == 'r')
15223 *p_str = PyObject_Repr(v);
15224 else
15225 *p_str = PyObject_ASCII(v);
15226 }
15227 break;
15228
15229 case 'i':
15230 case 'd':
15231 case 'u':
15232 case 'o':
15233 case 'x':
15234 case 'X':
15235 {
15236 int ret = mainformatlong(v, arg, p_str, writer);
15237 if (ret != 0)
15238 return ret;
15239 arg->sign = 1;
15240 break;
15241 }
15242
15243 case 'e':
15244 case 'E':
15245 case 'f':
15246 case 'F':
15247 case 'g':
15248 case 'G':
15249 if (arg->width == -1 && arg->prec == -1
15250 && !(arg->flags & (F_SIGN | F_BLANK)))
15251 {
15252 /* Fast path */
15253 if (formatfloat(v, arg, NULL, writer) == -1)
15254 return -1;
15255 return 1;
15256 }
15257
15258 arg->sign = 1;
15259 if (formatfloat(v, arg, p_str, NULL) == -1)
15260 return -1;
15261 break;
15262
15263 case 'c':
15264 {
15265 Py_UCS4 ch = formatchar(v);
15266 if (ch == (Py_UCS4) -1)
15267 return -1;
15268 if (arg->width == -1 && arg->prec == -1) {
15269 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020015270 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020015271 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020015272 return 1;
15273 }
15274 *p_str = PyUnicode_FromOrdinal(ch);
15275 break;
15276 }
15277
15278 default:
15279 PyErr_Format(PyExc_ValueError,
15280 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020015281 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020015282 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
15283 (int)arg->ch,
15284 ctx->fmtpos - 1);
15285 return -1;
15286 }
15287 if (*p_str == NULL)
15288 return -1;
15289 assert (PyUnicode_Check(*p_str));
15290 return 0;
15291}
15292
15293static int
15294unicode_format_arg_output(struct unicode_formatter_t *ctx,
15295 struct unicode_format_arg_t *arg,
15296 PyObject *str)
15297{
15298 Py_ssize_t len;
15299 enum PyUnicode_Kind kind;
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015300 const void *pbuf;
Victor Stinnera47082312012-10-04 02:19:54 +020015301 Py_ssize_t pindex;
15302 Py_UCS4 signchar;
15303 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015304 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015305 Py_ssize_t sublen;
15306 _PyUnicodeWriter *writer = &ctx->writer;
15307 Py_UCS4 fill;
15308
15309 fill = ' ';
15310 if (arg->sign && arg->flags & F_ZERO)
15311 fill = '0';
15312
15313 if (PyUnicode_READY(str) == -1)
15314 return -1;
15315
15316 len = PyUnicode_GET_LENGTH(str);
15317 if ((arg->width == -1 || arg->width <= len)
15318 && (arg->prec == -1 || arg->prec >= len)
15319 && !(arg->flags & (F_SIGN | F_BLANK)))
15320 {
15321 /* Fast path */
15322 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
15323 return -1;
15324 return 0;
15325 }
15326
15327 /* Truncate the string for "s", "r" and "a" formats
15328 if the precision is set */
15329 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
15330 if (arg->prec >= 0 && len > arg->prec)
15331 len = arg->prec;
15332 }
15333
15334 /* Adjust sign and width */
15335 kind = PyUnicode_KIND(str);
15336 pbuf = PyUnicode_DATA(str);
15337 pindex = 0;
15338 signchar = '\0';
15339 if (arg->sign) {
15340 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
15341 if (ch == '-' || ch == '+') {
15342 signchar = ch;
15343 len--;
15344 pindex++;
15345 }
15346 else if (arg->flags & F_SIGN)
15347 signchar = '+';
15348 else if (arg->flags & F_BLANK)
15349 signchar = ' ';
15350 else
15351 arg->sign = 0;
15352 }
15353 if (arg->width < len)
15354 arg->width = len;
15355
15356 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015357 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020015358 if (!(arg->flags & F_LJUST)) {
15359 if (arg->sign) {
15360 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015361 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015362 }
15363 else {
15364 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015365 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020015366 }
15367 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015368 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
15369 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070015370 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015371 }
15372
Victor Stinnera47082312012-10-04 02:19:54 +020015373 buflen = arg->width;
15374 if (arg->sign && len == arg->width)
15375 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020015376 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020015377 return -1;
15378
15379 /* Write the sign if needed */
15380 if (arg->sign) {
15381 if (fill != ' ') {
15382 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15383 writer->pos += 1;
15384 }
15385 if (arg->width > len)
15386 arg->width--;
15387 }
15388
15389 /* Write the numeric prefix for "x", "X" and "o" formats
15390 if the alternate form is used.
15391 For example, write "0x" for the "%#x" format. */
15392 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15393 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15394 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
15395 if (fill != ' ') {
15396 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15397 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15398 writer->pos += 2;
15399 pindex += 2;
15400 }
15401 arg->width -= 2;
15402 if (arg->width < 0)
15403 arg->width = 0;
15404 len -= 2;
15405 }
15406
15407 /* Pad left with the fill character if needed */
15408 if (arg->width > len && !(arg->flags & F_LJUST)) {
15409 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015410 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015411 writer->pos += sublen;
15412 arg->width = len;
15413 }
15414
15415 /* If padding with spaces: write sign if needed and/or numeric prefix if
15416 the alternate form is used */
15417 if (fill == ' ') {
15418 if (arg->sign) {
15419 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15420 writer->pos += 1;
15421 }
15422 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15423 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15424 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15425 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15426 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15427 writer->pos += 2;
15428 pindex += 2;
15429 }
15430 }
15431
15432 /* Write characters */
15433 if (len) {
15434 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15435 str, pindex, len);
15436 writer->pos += len;
15437 }
15438
15439 /* Pad right with the fill character if needed */
15440 if (arg->width > len) {
15441 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015442 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015443 writer->pos += sublen;
15444 }
15445 return 0;
15446}
15447
15448/* Helper of PyUnicode_Format(): format one arg.
15449 Return 0 on success, raise an exception and return -1 on error. */
15450static int
15451unicode_format_arg(struct unicode_formatter_t *ctx)
15452{
15453 struct unicode_format_arg_t arg;
15454 PyObject *str;
15455 int ret;
15456
Victor Stinner8dbd4212012-12-04 09:30:24 +010015457 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015458 if (arg.ch == '%') {
15459 ctx->fmtpos++;
15460 ctx->fmtcnt--;
15461 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15462 return -1;
15463 return 0;
15464 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015465 arg.flags = 0;
15466 arg.width = -1;
15467 arg.prec = -1;
15468 arg.sign = 0;
15469 str = NULL;
15470
Victor Stinnera47082312012-10-04 02:19:54 +020015471 ret = unicode_format_arg_parse(ctx, &arg);
15472 if (ret == -1)
15473 return -1;
15474
15475 ret = unicode_format_arg_format(ctx, &arg, &str);
15476 if (ret == -1)
15477 return -1;
15478
15479 if (ret != 1) {
15480 ret = unicode_format_arg_output(ctx, &arg, str);
15481 Py_DECREF(str);
15482 if (ret == -1)
15483 return -1;
15484 }
15485
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015486 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015487 PyErr_SetString(PyExc_TypeError,
15488 "not all arguments converted during string formatting");
15489 return -1;
15490 }
15491 return 0;
15492}
15493
Alexander Belopolsky40018472011-02-26 01:02:56 +000015494PyObject *
15495PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015496{
Victor Stinnera47082312012-10-04 02:19:54 +020015497 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015498
Guido van Rossumd57fd912000-03-10 22:53:23 +000015499 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015500 PyErr_BadInternalCall();
15501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015502 }
Victor Stinnera47082312012-10-04 02:19:54 +020015503
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015504 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015505 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015506
15507 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015508 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15509 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15510 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15511 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015512
Victor Stinner8f674cc2013-04-17 23:02:17 +020015513 _PyUnicodeWriter_Init(&ctx.writer);
15514 ctx.writer.min_length = ctx.fmtcnt + 100;
15515 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015516
Guido van Rossumd57fd912000-03-10 22:53:23 +000015517 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015518 ctx.arglen = PyTuple_Size(args);
15519 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015520 }
15521 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015522 ctx.arglen = -1;
15523 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015524 }
Victor Stinnera47082312012-10-04 02:19:54 +020015525 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015526 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015527 ctx.dict = args;
15528 else
15529 ctx.dict = NULL;
15530 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015531
Victor Stinnera47082312012-10-04 02:19:54 +020015532 while (--ctx.fmtcnt >= 0) {
15533 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015534 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015535
15536 nonfmtpos = ctx.fmtpos++;
15537 while (ctx.fmtcnt >= 0 &&
15538 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15539 ctx.fmtpos++;
15540 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015541 }
Victor Stinnera47082312012-10-04 02:19:54 +020015542 if (ctx.fmtcnt < 0) {
15543 ctx.fmtpos--;
15544 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015545 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015546
Victor Stinnercfc4c132013-04-03 01:48:39 +020015547 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15548 nonfmtpos, ctx.fmtpos) < 0)
15549 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015550 }
15551 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015552 ctx.fmtpos++;
15553 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015554 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015555 }
15556 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015557
Victor Stinnera47082312012-10-04 02:19:54 +020015558 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015559 PyErr_SetString(PyExc_TypeError,
15560 "not all arguments converted during string formatting");
15561 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015562 }
15563
Victor Stinnera47082312012-10-04 02:19:54 +020015564 if (ctx.args_owned) {
15565 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015566 }
Victor Stinnera47082312012-10-04 02:19:54 +020015567 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015568
Benjamin Peterson29060642009-01-31 22:14:21 +000015569 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015570 _PyUnicodeWriter_Dealloc(&ctx.writer);
15571 if (ctx.args_owned) {
15572 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015573 }
15574 return NULL;
15575}
15576
Jeremy Hylton938ace62002-07-17 16:30:39 +000015577static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015578unicode_subtype_new(PyTypeObject *type, PyObject *unicode);
15579
15580/*[clinic input]
15581@classmethod
15582str.__new__ as unicode_new
15583
15584 object as x: object = NULL
15585 encoding: str = NULL
15586 errors: str = NULL
15587
15588[clinic start generated code]*/
Guido van Rossume023fe02001-08-30 03:12:59 +000015589
Tim Peters6d6c1a32001-08-02 04:15:00 +000015590static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015591unicode_new_impl(PyTypeObject *type, PyObject *x, const char *encoding,
15592 const char *errors)
15593/*[clinic end generated code: output=fc72d4878b0b57e9 input=e81255e5676d174e]*/
Tim Peters6d6c1a32001-08-02 04:15:00 +000015594{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015595 PyObject *unicode;
15596 if (x == NULL) {
15597 unicode = unicode_new_empty();
15598 }
15599 else if (encoding == NULL && errors == NULL) {
15600 unicode = PyObject_Str(x);
15601 }
15602 else {
15603 unicode = PyUnicode_FromEncodedObject(x, encoding, errors);
15604 }
Tim Peters6d6c1a32001-08-02 04:15:00 +000015605
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015606 if (unicode != NULL && type != &PyUnicode_Type) {
15607 Py_SETREF(unicode, unicode_subtype_new(type, unicode));
15608 }
15609 return unicode;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015610}
15611
Guido van Rossume023fe02001-08-30 03:12:59 +000015612static PyObject *
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015613unicode_subtype_new(PyTypeObject *type, PyObject *unicode)
Guido van Rossume023fe02001-08-30 03:12:59 +000015614{
Serhiy Storchaka12f43342020-07-20 15:53:55 +030015615 PyObject *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015616 Py_ssize_t length, char_size;
15617 int share_wstr, share_utf8;
15618 unsigned int kind;
15619 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015620
Benjamin Peterson14339b62009-01-31 16:36:08 +000015621 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner910337b2011-10-03 03:20:16 +020015622 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015623 if (PyUnicode_READY(unicode) == -1) {
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015624 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015625 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015626
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015627 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015628 if (self == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015629 return NULL;
15630 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015631 kind = PyUnicode_KIND(unicode);
15632 length = PyUnicode_GET_LENGTH(unicode);
15633
15634 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015635#ifdef Py_DEBUG
15636 _PyUnicode_HASH(self) = -1;
15637#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015638 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015639#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015640 _PyUnicode_STATE(self).interned = 0;
15641 _PyUnicode_STATE(self).kind = kind;
15642 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015643 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015644 _PyUnicode_STATE(self).ready = 1;
15645 _PyUnicode_WSTR(self) = NULL;
15646 _PyUnicode_UTF8_LENGTH(self) = 0;
15647 _PyUnicode_UTF8(self) = NULL;
15648 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015649 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015650
15651 share_utf8 = 0;
15652 share_wstr = 0;
15653 if (kind == PyUnicode_1BYTE_KIND) {
15654 char_size = 1;
15655 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15656 share_utf8 = 1;
15657 }
15658 else if (kind == PyUnicode_2BYTE_KIND) {
15659 char_size = 2;
15660 if (sizeof(wchar_t) == 2)
15661 share_wstr = 1;
15662 }
15663 else {
15664 assert(kind == PyUnicode_4BYTE_KIND);
15665 char_size = 4;
15666 if (sizeof(wchar_t) == 4)
15667 share_wstr = 1;
15668 }
15669
15670 /* Ensure we won't overflow the length. */
15671 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15672 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015673 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015674 }
Victor Stinner32bd68c2020-12-01 10:37:39 +010015675 data = PyObject_Malloc((length + 1) * char_size);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015676 if (data == NULL) {
15677 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015678 goto onError;
15679 }
15680
Victor Stinnerc3c74152011-10-02 20:39:55 +020015681 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015682 if (share_utf8) {
15683 _PyUnicode_UTF8_LENGTH(self) = length;
15684 _PyUnicode_UTF8(self) = data;
15685 }
15686 if (share_wstr) {
15687 _PyUnicode_WSTR_LENGTH(self) = length;
15688 _PyUnicode_WSTR(self) = (wchar_t *)data;
15689 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015690
Christian Heimesf051e432016-09-13 20:22:02 +020015691 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015692 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015693 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015694#ifdef Py_DEBUG
15695 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15696#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +010015697 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015698
15699onError:
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015700 Py_DECREF(self);
15701 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015702}
15703
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015704PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015705"str(object='') -> str\n\
15706str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015707\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015708Create a new string object from the given object. If encoding or\n\
15709errors is specified, then the object must expose a data buffer\n\
15710that will be decoded using the given encoding and error handler.\n\
15711Otherwise, returns the result of object.__str__() (if defined)\n\
15712or repr(object).\n\
15713encoding defaults to sys.getdefaultencoding().\n\
15714errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015715
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015716static PyObject *unicode_iter(PyObject *seq);
15717
Guido van Rossumd57fd912000-03-10 22:53:23 +000015718PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015719 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015720 "str", /* tp_name */
15721 sizeof(PyUnicodeObject), /* tp_basicsize */
15722 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015723 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015724 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015725 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015726 0, /* tp_getattr */
15727 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015728 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015729 unicode_repr, /* tp_repr */
15730 &unicode_as_number, /* tp_as_number */
15731 &unicode_as_sequence, /* tp_as_sequence */
15732 &unicode_as_mapping, /* tp_as_mapping */
15733 (hashfunc) unicode_hash, /* tp_hash*/
15734 0, /* tp_call*/
15735 (reprfunc) unicode_str, /* tp_str */
15736 PyObject_GenericGetAttr, /* tp_getattro */
15737 0, /* tp_setattro */
15738 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015739 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Brandt Bucher145bf262021-02-26 14:51:55 -080015740 Py_TPFLAGS_UNICODE_SUBCLASS |
15741 _Py_TPFLAGS_MATCH_SELF, /* tp_flags */
Bupfc93bd42018-06-19 03:59:55 -050015742 unicode_doc, /* tp_doc */
15743 0, /* tp_traverse */
15744 0, /* tp_clear */
15745 PyUnicode_RichCompare, /* tp_richcompare */
15746 0, /* tp_weaklistoffset */
15747 unicode_iter, /* tp_iter */
15748 0, /* tp_iternext */
15749 unicode_methods, /* tp_methods */
15750 0, /* tp_members */
15751 0, /* tp_getset */
15752 &PyBaseObject_Type, /* tp_base */
15753 0, /* tp_dict */
15754 0, /* tp_descr_get */
15755 0, /* tp_descr_set */
15756 0, /* tp_dictoffset */
15757 0, /* tp_init */
15758 0, /* tp_alloc */
15759 unicode_new, /* tp_new */
15760 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015761};
15762
15763/* Initialize the Unicode implementation */
15764
Victor Stinner331a6a52019-05-27 16:39:22 +020015765PyStatus
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015766_PyUnicode_Init(PyInterpreterState *interp)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015767{
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015768 struct _Py_unicode_state *state = &interp->unicode;
Victor Stinner91698d82020-06-25 14:07:40 +020015769 if (unicode_create_empty_string_singleton(state) < 0) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015770 return _PyStatus_NO_MEMORY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015771 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015772
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015773 if (_Py_IsMainInterpreter(interp)) {
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015774 /* initialize the linebreak bloom filter */
Victor Stinner442ad742021-04-02 15:28:13 +020015775 const Py_UCS2 linebreak[] = {
15776 0x000A, /* LINE FEED */
15777 0x000D, /* CARRIAGE RETURN */
15778 0x001C, /* FILE SEPARATOR */
15779 0x001D, /* GROUP SEPARATOR */
15780 0x001E, /* RECORD SEPARATOR */
15781 0x0085, /* NEXT LINE */
15782 0x2028, /* LINE SEPARATOR */
15783 0x2029, /* PARAGRAPH SEPARATOR */
15784 };
Victor Stinnerf363d0a2020-06-24 00:10:40 +020015785 bloom_linebreak = make_bloom_mask(
15786 PyUnicode_2BYTE_KIND, linebreak,
15787 Py_ARRAY_LENGTH(linebreak));
Victor Stinner442ad742021-04-02 15:28:13 +020015788 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015789
Victor Stinner442ad742021-04-02 15:28:13 +020015790 return _PyStatus_OK();
15791}
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015792
Victor Stinner442ad742021-04-02 15:28:13 +020015793
15794PyStatus
15795_PyUnicode_InitTypes(void)
15796{
15797 if (PyType_Ready(&PyUnicode_Type) < 0) {
15798 return _PyStatus_ERR("Can't initialize unicode type");
15799 }
15800 if (PyType_Ready(&EncodingMapType) < 0) {
15801 return _PyStatus_ERR("Can't initialize encoding map type");
15802 }
15803 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
15804 return _PyStatus_ERR("Can't initialize field name iterator type");
15805 }
15806 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
15807 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015808 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015809 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015810}
15811
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015812
Walter Dörwald16807132007-05-25 13:52:07 +000015813void
15814PyUnicode_InternInPlace(PyObject **p)
15815{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015816 PyObject *s = *p;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015817#ifdef Py_DEBUG
15818 assert(s != NULL);
15819 assert(_PyUnicode_CHECK(s));
15820#else
Victor Stinner607b1022020-05-05 18:50:30 +020015821 if (s == NULL || !PyUnicode_Check(s)) {
Victor Stinner4fae54c2011-10-03 02:01:52 +020015822 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015823 }
Victor Stinner4fae54c2011-10-03 02:01:52 +020015824#endif
Victor Stinner607b1022020-05-05 18:50:30 +020015825
Benjamin Peterson14339b62009-01-31 16:36:08 +000015826 /* If it's a subclass, we don't really know what putting
15827 it in the interned dict might do. */
Victor Stinner607b1022020-05-05 18:50:30 +020015828 if (!PyUnicode_CheckExact(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015829 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015830 }
15831
15832 if (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015833 return;
Victor Stinner607b1022020-05-05 18:50:30 +020015834 }
15835
Victor Stinner666ecfb2020-07-02 01:19:57 +020015836 if (PyUnicode_READY(s) == -1) {
15837 PyErr_Clear();
15838 return;
15839 }
15840
Victor Stinnerea251802020-12-26 02:58:33 +010015841 struct _Py_unicode_state *state = get_unicode_state();
15842 if (state->interned == NULL) {
15843 state->interned = PyDict_New();
15844 if (state->interned == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015845 PyErr_Clear(); /* Don't leave an exception */
15846 return;
15847 }
15848 }
Victor Stinner607b1022020-05-05 18:50:30 +020015849
Victor Stinnerea251802020-12-26 02:58:33 +010015850 PyObject *t = PyDict_SetDefault(state->interned, s, s);
Berker Peksagced8d4c2016-07-25 04:40:39 +030015851 if (t == NULL) {
15852 PyErr_Clear();
15853 return;
15854 }
Victor Stinner607b1022020-05-05 18:50:30 +020015855
Berker Peksagced8d4c2016-07-25 04:40:39 +030015856 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015857 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015858 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015859 return;
15860 }
Victor Stinner607b1022020-05-05 18:50:30 +020015861
Victor Stinner3549ca32020-07-03 16:59:12 +020015862 /* The two references in interned dict (key and value) are not counted by
15863 refcnt. unicode_dealloc() and _PyUnicode_ClearInterned() take care of
15864 this. */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015865 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015866 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015867}
15868
Victor Stinnerea251802020-12-26 02:58:33 +010015869
Walter Dörwald16807132007-05-25 13:52:07 +000015870void
15871PyUnicode_InternImmortal(PyObject **p)
15872{
Victor Stinner583ee5a2020-10-02 14:49:00 +020015873 if (PyErr_WarnEx(PyExc_DeprecationWarning,
15874 "PyUnicode_InternImmortal() is deprecated; "
15875 "use PyUnicode_InternInPlace() instead", 1) < 0)
15876 {
15877 // The function has no return value, the exception cannot
15878 // be reported to the caller, so just log it.
15879 PyErr_WriteUnraisable(NULL);
15880 }
15881
Benjamin Peterson14339b62009-01-31 16:36:08 +000015882 PyUnicode_InternInPlace(p);
15883 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015884 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015885 Py_INCREF(*p);
15886 }
Walter Dörwald16807132007-05-25 13:52:07 +000015887}
15888
15889PyObject *
15890PyUnicode_InternFromString(const char *cp)
15891{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015892 PyObject *s = PyUnicode_FromString(cp);
15893 if (s == NULL)
15894 return NULL;
15895 PyUnicode_InternInPlace(&s);
15896 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015897}
15898
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015899
Victor Stinner666ecfb2020-07-02 01:19:57 +020015900void
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015901_PyUnicode_ClearInterned(PyInterpreterState *interp)
Walter Dörwald16807132007-05-25 13:52:07 +000015902{
Victor Stinnerbcb094b2021-02-19 15:10:45 +010015903 struct _Py_unicode_state *state = &interp->unicode;
Victor Stinnerea251802020-12-26 02:58:33 +010015904 if (state->interned == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015905 return;
15906 }
Victor Stinnerea251802020-12-26 02:58:33 +010015907 assert(PyDict_CheckExact(state->interned));
Victor Stinner666ecfb2020-07-02 01:19:57 +020015908
15909 /* Interned unicode strings are not forcibly deallocated; rather, we give
15910 them their stolen references back, and then clear and DECREF the
15911 interned dict. */
Walter Dörwald16807132007-05-25 13:52:07 +000015912
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015913#ifdef INTERNED_STATS
Victor Stinnerea251802020-12-26 02:58:33 +010015914 fprintf(stderr, "releasing %zd interned strings\n",
15915 PyDict_GET_SIZE(state->interned));
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015916
15917 Py_ssize_t immortal_size = 0, mortal_size = 0;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015918#endif
Victor Stinnerea251802020-12-26 02:58:33 +010015919 Py_ssize_t pos = 0;
15920 PyObject *s, *ignored_value;
15921 while (PyDict_Next(state->interned, &pos, &s, &ignored_value)) {
Victor Stinner666ecfb2020-07-02 01:19:57 +020015922 assert(PyUnicode_IS_READY(s));
15923
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015924 switch (PyUnicode_CHECK_INTERNED(s)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015925 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015926 Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015927#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015928 immortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015929#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015930 break;
15931 case SSTATE_INTERNED_MORTAL:
Victor Stinner3549ca32020-07-03 16:59:12 +020015932 // Restore the two references (key and value) ignored
15933 // by PyUnicode_InternInPlace().
Victor Stinnerc96a61e2020-06-08 01:39:47 +020015934 Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015935#ifdef INTERNED_STATS
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015936 mortal_size += PyUnicode_GET_LENGTH(s);
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015937#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015938 break;
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015939 case SSTATE_NOT_INTERNED:
15940 /* fall through */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015941 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +010015942 Py_UNREACHABLE();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015943 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015944 _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015945 }
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015946#ifdef INTERNED_STATS
Victor Stinnerd36cf5f2020-06-10 18:38:05 +020015947 fprintf(stderr,
15948 "total size of all interned strings: %zd/%zd mortal/immortal\n",
15949 mortal_size, immortal_size);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015950#endif
Victor Stinner666ecfb2020-07-02 01:19:57 +020015951
Victor Stinnerea251802020-12-26 02:58:33 +010015952 PyDict_Clear(state->interned);
15953 Py_CLEAR(state->interned);
Walter Dörwald16807132007-05-25 13:52:07 +000015954}
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015955
15956
15957/********************* Unicode Iterator **************************/
15958
15959typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015960 PyObject_HEAD
15961 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015962 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015963} unicodeiterobject;
15964
15965static void
15966unicodeiter_dealloc(unicodeiterobject *it)
15967{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015968 _PyObject_GC_UNTRACK(it);
15969 Py_XDECREF(it->it_seq);
15970 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015971}
15972
15973static int
15974unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15975{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015976 Py_VISIT(it->it_seq);
15977 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015978}
15979
15980static PyObject *
15981unicodeiter_next(unicodeiterobject *it)
15982{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015983 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015984
Benjamin Peterson14339b62009-01-31 16:36:08 +000015985 assert(it != NULL);
15986 seq = it->it_seq;
15987 if (seq == NULL)
15988 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015989 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015990
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015991 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15992 int kind = PyUnicode_KIND(seq);
Serhiy Storchakacd8295f2020-04-11 10:48:40 +030015993 const void *data = PyUnicode_DATA(seq);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015994 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15995 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015996 if (item != NULL)
15997 ++it->it_index;
15998 return item;
15999 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016000
Benjamin Peterson14339b62009-01-31 16:36:08 +000016001 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030016002 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000016003 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016004}
16005
16006static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053016007unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016008{
Benjamin Peterson14339b62009-01-31 16:36:08 +000016009 Py_ssize_t len = 0;
16010 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020016011 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016012 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016013}
16014
16015PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
16016
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000016017static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053016018unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000016019{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020016020 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000016021 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020016022 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000016023 it->it_seq, it->it_index);
16024 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020016025 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000016026 if (u == NULL)
16027 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020016028 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000016029 }
16030}
16031
16032PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
16033
16034static PyObject *
16035unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
16036{
16037 Py_ssize_t index = PyLong_AsSsize_t(state);
16038 if (index == -1 && PyErr_Occurred())
16039 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000016040 if (it->it_seq != NULL) {
16041 if (index < 0)
16042 index = 0;
16043 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
16044 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
16045 it->it_index = index;
16046 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000016047 Py_RETURN_NONE;
16048}
16049
16050PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
16051
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016052static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000016053 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000016054 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000016055 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
16056 reduce_doc},
16057 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
16058 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000016059 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016060};
16061
16062PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000016063 PyVarObject_HEAD_INIT(&PyType_Type, 0)
16064 "str_iterator", /* tp_name */
16065 sizeof(unicodeiterobject), /* tp_basicsize */
16066 0, /* tp_itemsize */
16067 /* methods */
16068 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020016069 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000016070 0, /* tp_getattr */
16071 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020016072 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000016073 0, /* tp_repr */
16074 0, /* tp_as_number */
16075 0, /* tp_as_sequence */
16076 0, /* tp_as_mapping */
16077 0, /* tp_hash */
16078 0, /* tp_call */
16079 0, /* tp_str */
16080 PyObject_GenericGetAttr, /* tp_getattro */
16081 0, /* tp_setattro */
16082 0, /* tp_as_buffer */
16083 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
16084 0, /* tp_doc */
16085 (traverseproc)unicodeiter_traverse, /* tp_traverse */
16086 0, /* tp_clear */
16087 0, /* tp_richcompare */
16088 0, /* tp_weaklistoffset */
16089 PyObject_SelfIter, /* tp_iter */
16090 (iternextfunc)unicodeiter_next, /* tp_iternext */
16091 unicodeiter_methods, /* tp_methods */
16092 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016093};
16094
16095static PyObject *
16096unicode_iter(PyObject *seq)
16097{
Benjamin Peterson14339b62009-01-31 16:36:08 +000016098 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016099
Benjamin Peterson14339b62009-01-31 16:36:08 +000016100 if (!PyUnicode_Check(seq)) {
16101 PyErr_BadInternalCall();
16102 return NULL;
16103 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020016104 if (PyUnicode_READY(seq) == -1)
16105 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016106 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
16107 if (it == NULL)
16108 return NULL;
16109 it->it_index = 0;
16110 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020016111 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000016112 _PyObject_GC_TRACK(it);
16113 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000016114}
16115
Victor Stinner709d23d2019-05-02 14:56:30 -040016116static int
16117encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016118{
Victor Stinner709d23d2019-05-02 14:56:30 -040016119 int res;
16120 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
16121 if (res == -2) {
16122 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
16123 return -1;
16124 }
16125 if (res < 0) {
16126 PyErr_NoMemory();
16127 return -1;
16128 }
16129 return 0;
16130}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016131
Victor Stinner709d23d2019-05-02 14:56:30 -040016132
16133static int
16134config_get_codec_name(wchar_t **config_encoding)
16135{
16136 char *encoding;
16137 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
16138 return -1;
16139 }
16140
16141 PyObject *name_obj = NULL;
16142 PyObject *codec = _PyCodec_Lookup(encoding);
16143 PyMem_RawFree(encoding);
16144
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016145 if (!codec)
16146 goto error;
16147
16148 name_obj = PyObject_GetAttrString(codec, "name");
16149 Py_CLEAR(codec);
16150 if (!name_obj) {
16151 goto error;
16152 }
16153
Victor Stinner709d23d2019-05-02 14:56:30 -040016154 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
16155 Py_DECREF(name_obj);
16156 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016157 goto error;
16158 }
16159
Victor Stinner709d23d2019-05-02 14:56:30 -040016160 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
16161 if (raw_wname == NULL) {
16162 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016163 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040016164 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016165 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016166
16167 PyMem_RawFree(*config_encoding);
16168 *config_encoding = raw_wname;
16169
16170 PyMem_Free(wname);
16171 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016172
16173error:
16174 Py_XDECREF(codec);
16175 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040016176 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016177}
16178
16179
Victor Stinner331a6a52019-05-27 16:39:22 +020016180static PyStatus
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016181init_stdio_encoding(PyInterpreterState *interp)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016182{
Victor Stinner709d23d2019-05-02 14:56:30 -040016183 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016184 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016185 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016186 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016187 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016188 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016189 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016190}
16191
16192
Victor Stinner709d23d2019-05-02 14:56:30 -040016193static int
16194init_fs_codec(PyInterpreterState *interp)
16195{
Victor Stinnerda7933e2020-04-13 03:04:28 +020016196 const PyConfig *config = _PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016197
16198 _Py_error_handler error_handler;
16199 error_handler = get_error_handler_wide(config->filesystem_errors);
16200 if (error_handler == _Py_ERROR_UNKNOWN) {
Christian Claussdcfbe4f2021-10-07 16:31:33 +020016201 PyErr_SetString(PyExc_RuntimeError, "unknown filesystem error handler");
Victor Stinner709d23d2019-05-02 14:56:30 -040016202 return -1;
16203 }
16204
16205 char *encoding, *errors;
16206 if (encode_wstr_utf8(config->filesystem_encoding,
16207 &encoding,
16208 "filesystem_encoding") < 0) {
16209 return -1;
16210 }
16211
16212 if (encode_wstr_utf8(config->filesystem_errors,
16213 &errors,
16214 "filesystem_errors") < 0) {
16215 PyMem_RawFree(encoding);
16216 return -1;
16217 }
16218
Victor Stinner3d17c042020-05-14 01:48:38 +020016219 struct _Py_unicode_fs_codec *fs_codec = &interp->unicode.fs_codec;
16220 PyMem_RawFree(fs_codec->encoding);
16221 fs_codec->encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016222 /* encoding has been normalized by init_fs_encoding() */
Victor Stinner3d17c042020-05-14 01:48:38 +020016223 fs_codec->utf8 = (strcmp(encoding, "utf-8") == 0);
16224 PyMem_RawFree(fs_codec->errors);
16225 fs_codec->errors = errors;
16226 fs_codec->error_handler = error_handler;
Victor Stinner709d23d2019-05-02 14:56:30 -040016227
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016228#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner3d17c042020-05-14 01:48:38 +020016229 assert(fs_codec->utf8 == 1);
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016230#endif
16231
Victor Stinner709d23d2019-05-02 14:56:30 -040016232 /* At this point, PyUnicode_EncodeFSDefault() and
16233 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
16234 the C implementation of the filesystem encoding. */
16235
16236 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
16237 global configuration variables. */
Victor Stinner3d17c042020-05-14 01:48:38 +020016238 if (_Py_SetFileSystemEncoding(fs_codec->encoding,
16239 fs_codec->errors) < 0) {
Victor Stinner709d23d2019-05-02 14:56:30 -040016240 PyErr_NoMemory();
16241 return -1;
16242 }
16243 return 0;
16244}
16245
16246
Victor Stinner331a6a52019-05-27 16:39:22 +020016247static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016248init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016249{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016250 PyInterpreterState *interp = tstate->interp;
16251
Victor Stinner709d23d2019-05-02 14:56:30 -040016252 /* Update the filesystem encoding to the normalized Python codec name.
16253 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
16254 (Python codec name). */
Victor Stinnerda7933e2020-04-13 03:04:28 +020016255 PyConfig *config = (PyConfig*)_PyInterpreterState_GetConfig(interp);
Victor Stinner709d23d2019-05-02 14:56:30 -040016256 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016257 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016258 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016259 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016260 }
16261
Victor Stinner709d23d2019-05-02 14:56:30 -040016262 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020016263 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016264 }
Victor Stinner331a6a52019-05-27 16:39:22 +020016265 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016266}
16267
16268
Victor Stinner331a6a52019-05-27 16:39:22 +020016269PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020016270_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016271{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020016272 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020016273 if (_PyStatus_EXCEPTION(status)) {
16274 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016275 }
16276
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016277 return init_stdio_encoding(tstate->interp);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040016278}
16279
16280
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016281static void
Victor Stinner3d17c042020-05-14 01:48:38 +020016282_PyUnicode_FiniEncodings(struct _Py_unicode_fs_codec *fs_codec)
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016283{
Victor Stinner3d17c042020-05-14 01:48:38 +020016284 PyMem_RawFree(fs_codec->encoding);
16285 fs_codec->encoding = NULL;
16286 fs_codec->utf8 = 0;
16287 PyMem_RawFree(fs_codec->errors);
16288 fs_codec->errors = NULL;
16289 fs_codec->error_handler = _Py_ERROR_UNKNOWN;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016290}
16291
16292
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to "mbcs" with the
   "replace" error handler (PEP 529) and re-initialize the filesystem codec.

   Return the result of init_fs_codec() on success. Return -1 with
   MemoryError set if either configuration string cannot be duplicated; in
   that case the configuration is left untouched. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET();
    /* The accessor returns a const configuration; the cast allows the two
       filesystem fields to be replaced below. */
    PyConfig *config = (PyConfig *)_PyInterpreterState_GetConfig(interp);

    /* Allocate both replacements up front so the configuration is only
       modified when nothing can fail anymore. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one duplication failed: release whichever one succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
16318
16319
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016320void
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016321_PyUnicode_Fini(PyInterpreterState *interp)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016322{
Victor Stinnerbcb094b2021-02-19 15:10:45 +010016323 struct _Py_unicode_state *state = &interp->unicode;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016324
Victor Stinnerea251802020-12-26 02:58:33 +010016325 // _PyUnicode_ClearInterned() must be called before
16326 assert(state->interned == NULL);
16327
16328 _PyUnicode_FiniEncodings(&state->fs_codec);
16329
Victor Stinnerf4507232020-12-26 20:26:08 +010016330 unicode_clear_identifiers(state);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016331
Victor Stinner2f9ada92020-06-24 02:22:21 +020016332 for (Py_ssize_t i = 0; i < 256; i++) {
16333 Py_CLEAR(state->latin1[i]);
16334 }
Victor Stinnerea251802020-12-26 02:58:33 +010016335 Py_CLEAR(state->empty_string);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016336}
16337
16338
Georg Brandl66c221e2010-10-14 07:04:07 +000016339/* A _string module, to export formatter_parser and formatter_field_name_split
16340 to the string.Formatter class implemented in Python. */
16341
16342static PyMethodDef _string_methods[] = {
16343 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16344 METH_O, PyDoc_STR("split the argument as a field name")},
16345 {"formatter_parser", (PyCFunction) formatter_parser,
16346 METH_O, PyDoc_STR("parse the argument as a format string")},
16347 {NULL, NULL}
16348};
16349
16350static struct PyModuleDef _string_module = {
16351 PyModuleDef_HEAD_INIT,
Victor Stinnerbb083d32020-09-08 15:33:08 +020016352 .m_name = "_string",
16353 .m_doc = PyDoc_STR("string helper module"),
16354 .m_size = 0,
16355 .m_methods = _string_methods,
Georg Brandl66c221e2010-10-14 07:04:07 +000016356};
16357
16358PyMODINIT_FUNC
16359PyInit__string(void)
16360{
Victor Stinnerbb083d32020-09-08 15:33:08 +020016361 return PyModuleDef_Init(&_string_module);
Georg Brandl66c221e2010-10-14 07:04:07 +000016362}
16363
16364
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016365#ifdef __cplusplus
16366}
16367#endif